Example #1
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name):
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension, self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr, self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):
        """Multiple training.
        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.item(),
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(
            self.data.id2word, self.output_file_name, self.use_cuda)
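
All of these trainers delegate the loss computation to a SkipGramModel whose forward(pos_u, pos_v, neg_v) returns the skip-gram negative-sampling loss. The snippet below is only a minimal sketch of such a model, not the class any of these projects actually ships; the two-table layout, the (batch, negatives) shape assumed for neg_v, and the sparse embeddings (compatible with the plain SGD and SparseAdam optimizers used in these examples) are assumptions for illustration.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SkipGramModelSketch(nn.Module):
    """Minimal skip-gram with negative sampling; illustrative only."""

    def __init__(self, emb_size, emb_dimension):
        super().__init__()
        # Separate "center" (u) and "context" (v) embedding tables, as in word2vec.
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        bound = 0.5 / emb_dimension
        nn.init.uniform_(self.u_embeddings.weight, -bound, bound)
        nn.init.constant_(self.v_embeddings.weight, 0)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)      # (B, D) center words
        emb_v = self.v_embeddings(pos_v)      # (B, D) observed context words
        emb_neg = self.v_embeddings(neg_v)    # (B, K, D) sampled negative contexts
        pos_score = torch.sum(emb_u * emb_v, dim=1)                    # (B,)
        neg_score = torch.bmm(emb_neg, emb_u.unsqueeze(2)).squeeze(2)  # (B, K)
        # Maximize log-sigmoid of positive scores and of negated negative scores.
        loss = -(F.logsigmoid(pos_score) + F.logsigmoid(-neg_score).sum(dim=1))
        return loss.mean()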
Example #2
class Metapath2VecTrainer:
    def __init__(self, args):
        if args.aminer:
            dataset = AminerDataset(args.path)
        else:
            dataset = CustomDataset(args.path)
        self.data = DataReader(dataset, args.min_count, args.care_type)
        dataset = Metapath2vecDataset(self.data, args.window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)

        self.output_file_name = args.output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):

        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):

                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()  # step the LR scheduler after the optimizer update

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name)
Example #3
def train(args):

    data = InputData(args.input, args.min_count, args.sample)
    output_file_name = args.output
    emb_size = len(data.word2id)
    emb_dimension = args.dim
    batch_size = args.mb
    window_size = args.window
    n_negs = args.n_negs
    iteration = args.iters
    initial_lr = args.lr
    use_cuda = args.cuda

    skip_gram_model = SkipGramModel(emb_size, emb_dimension)
    if use_cuda: skip_gram_model = skip_gram_model.cuda()

    optimizer = optim.SGD(skip_gram_model.parameters(), lr=initial_lr)

    pair_count = data.evaluate_pair_count(window_size)
    batch_count = iteration * pair_count / batch_size
    process_bar = tqdm(range(int(batch_count)))

    # skip_gram_model.save_embedding(
    #     data.id2word, 'begin_embedding.txt', use_cuda)

    for i in process_bar:
        pos_pairs = data.get_batch_pairs(batch_size, window_size)
        neg_v = data.get_neg_v_neg_sampling(pos_pairs, n_negs)
        pos_u = [pair[0] for pair in pos_pairs]
        pos_v = [pair[1] for pair in pos_pairs]

        pos_u = torch.LongTensor(pos_u)
        pos_v = torch.LongTensor(pos_v)
        neg_v = torch.LongTensor(neg_v)
        if use_cuda:
            pos_u = pos_u.cuda()
            pos_v = pos_v.cuda()
            neg_v = neg_v.cuda()

        optimizer.zero_grad()
        loss = skip_gram_model(pos_u, pos_v, neg_v)
        loss.backward()
        optimizer.step()

        process_bar.set_description(
            "\rLoss: %0.8f, lr: %0.6f" %
            (loss.item(), optimizer.param_groups[0]['lr']))

        if i * batch_size % 100000 == 0:
            lr = initial_lr * (1.0 - 1.0 * i / batch_count)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

    skip_gram_model.save_embedding(data.id2word, output_file_name, use_cuda)
Example #4
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
                 window_size=5, iteration=5, initial_lr=0.025, neg_num=5, min_count=5):

        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):

        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        count = int(batch_count) // 3
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)

            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, self.neg_num)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u)).cuda()
            pos_v = Variable(torch.LongTensor(pos_v)).cuda()
            neg_v = Variable(torch.LongTensor(neg_v)).cuda()
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.item(),
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            if i != 0 and i % count == 0:
                self.skip_gram_model.save_embedding(self.data.id2word,self.output_file_name + str(i))
        self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name + 'final')
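
A hypothetical way to drive the Word2Vec class above; the file names are placeholders, and note that this variant calls .cuda() unconditionally in __init__, so it assumes a CUDA-capable machine.

if __name__ == '__main__':
    # Placeholder paths; InputData and SkipGramModel come from the surrounding project.
    w2v = Word2Vec(input_file_name='corpus.txt',
                   output_file_name='word_embeddings_',
                   emb_dimension=100,
                   batch_size=50,
                   window_size=5,
                   iteration=5,
                   initial_lr=0.025,
                   neg_num=5,
                   min_count=5)
    # Writes intermediate snapshots 'word_embeddings_<i>' and a final 'word_embeddings_final'.
    w2v.train()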
Example #5
File: line.py  Project: yuk12/dgl
class LineTrainer:
    def __init__(self, args):
        """ Initializing the trainer with the input arguments """
        self.args = args
        self.dataset = LineDataset(
            net_file=args.data_file,
            batch_size=args.batch_size,
            negative=args.negative,
            gpus=args.gpus,
            fast_neg=args.fast_neg,
            ogbl_name=args.ogbl_name,
            load_from_ogbl=args.load_from_ogbl,
            ogbn_name=args.ogbn_name,
            load_from_ogbn=args.load_from_ogbn,
            num_samples=args.num_samples * 1000000,
        )
        self.emb_size = self.dataset.G.number_of_nodes()
        self.emb_model = None

    def init_device_emb(self):
        """ set the device before training 
        will be called once in fast_train_mp / fast_train
        """
        choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
        assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"

        # initializing embedding on CPU
        self.emb_model = SkipGramModel(
            emb_size=self.emb_size,
            emb_dimension=self.args.dim,
            batch_size=self.args.batch_size,
            only_cpu=self.args.only_cpu,
            only_gpu=self.args.only_gpu,
            only_fst=self.args.only_fst,
            only_snd=self.args.only_snd,
            mix=self.args.mix,
            neg_weight=self.args.neg_weight,
            negative=self.args.negative,
            lr=self.args.lr,
            lap_norm=self.args.lap_norm,
            fast_neg=self.args.fast_neg,
            record_loss=self.args.print_loss,
            async_update=self.args.async_update,
            num_threads=self.args.num_threads,
        )

        torch.set_num_threads(self.args.num_threads)
        if self.args.only_gpu:
            print("Run in 1 GPU")
            assert self.args.gpus[0] >= 0
            self.emb_model.all_to_device(self.args.gpus[0])
        elif self.args.mix:
            print("Mix CPU with %d GPU" % len(self.args.gpus))
            if len(self.args.gpus) == 1:
                assert self.args.gpus[
                    0] >= 0, 'mix CPU with GPU requires an available GPU'
                self.emb_model.set_device(self.args.gpus[0])
        else:
            print("Run in CPU process")

    def train(self):
        """ train the embedding """
        if len(self.args.gpus) > 1:
            self.fast_train_mp()
        else:
            self.fast_train()

    def fast_train_mp(self):
        """ multi-cpu-core or mix cpu & multi-gpu """
        self.init_device_emb()
        self.emb_model.share_memory()

        sum_up_params(self.emb_model)

        start_all = time.time()
        ps = []

        for i in range(len(self.args.gpus)):
            p = mp.Process(target=self.fast_train_sp,
                           args=(i, self.args.gpus[i]))
            ps.append(p)
            p.start()

        for p in ps:
            p.join()

        print("Used time: %.2fs" % (time.time() - start_all))
        if self.args.save_in_pt:
            self.emb_model.save_embedding_pt(self.dataset,
                                             self.args.output_emb_file)
        else:
            self.emb_model.save_embedding(self.dataset,
                                          self.args.output_emb_file)

    def fast_train_sp(self, rank, gpu_id):
        """ a subprocess for fast_train_mp """
        if self.args.mix:
            self.emb_model.set_device(gpu_id)

        torch.set_num_threads(self.args.num_threads)
        if self.args.async_update:
            self.emb_model.create_async_update()

        sampler = self.dataset.create_sampler(rank)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=self.args.num_sampler_threads,
        )
        num_batches = len(dataloader)
        print("num batchs: %d in process [%d] GPU [%d]" %
              (num_batches, rank, gpu_id))

        start = time.time()
        with torch.no_grad():
            for i, edges in enumerate(dataloader):
                if self.args.fast_neg:
                    self.emb_model.fast_learn(edges)
                else:
                    # do negative sampling
                    bs = edges.size()[0]
                    neg_nodes = torch.LongTensor(
                        np.random.choice(self.dataset.neg_table,
                                         bs * self.args.negative,
                                         replace=True))
                    self.emb_model.fast_learn(edges, neg_nodes=neg_nodes)

                if i > 0 and i % self.args.print_interval == 0:
                    if self.args.print_loss:
                        if self.args.only_fst:
                            print("GPU-[%d] batch %d time: %.2fs fst-loss: %.4f" \
                                % (gpu_id, i, time.time()-start, -sum(self.emb_model.loss_fst)/self.args.print_interval))
                        elif self.args.only_snd:
                            print("GPU-[%d] batch %d time: %.2fs snd-loss: %.4f" \
                                % (gpu_id, i, time.time()-start, -sum(self.emb_model.loss_snd)/self.args.print_interval))
                        else:
                            print("GPU-[%d] batch %d time: %.2fs fst-loss: %.4f snd-loss: %.4f" \
                                % (gpu_id, i, time.time()-start, \
                                -sum(self.emb_model.loss_fst)/self.args.print_interval, \
                                -sum(self.emb_model.loss_snd)/self.args.print_interval))
                        self.emb_model.loss_fst = []
                        self.emb_model.loss_snd = []
                    else:
                        print("GPU-[%d] batch %d time: %.2fs" %
                              (gpu_id, i, time.time() - start))
                    start = time.time()

            if self.args.async_update:
                self.emb_model.finish_async_update()

    def fast_train(self):
        """ fast train with dataloader with only gpu / only cpu"""
        self.init_device_emb()

        if self.args.async_update:
            self.emb_model.share_memory()
            self.emb_model.create_async_update()

        sum_up_params(self.emb_model)

        sampler = self.dataset.create_sampler(0)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=self.args.num_sampler_threads,
        )

        num_batches = len(dataloader)
        print("num batchs: %d\n" % num_batches)

        start_all = time.time()
        start = time.time()
        with torch.no_grad():
            for i, edges in enumerate(dataloader):
                if self.args.fast_neg:
                    self.emb_model.fast_learn(edges)
                else:
                    # do negative sampling
                    bs = edges.size()[0]
                    neg_nodes = torch.LongTensor(
                        np.random.choice(self.dataset.neg_table,
                                         bs * self.args.negative,
                                         replace=True))
                    self.emb_model.fast_learn(edges, neg_nodes=neg_nodes)

                if i > 0 and i % self.args.print_interval == 0:
                    if self.args.print_loss:
                        if self.args.only_fst:
                            print("Batch %d time: %.2fs fst-loss: %.4f" \
                                % (i, time.time()-start, -sum(self.emb_model.loss_fst)/self.args.print_interval))
                        elif self.args.only_snd:
                            print("Batch %d time: %.2fs snd-loss: %.4f" \
                                % (i, time.time()-start, -sum(self.emb_model.loss_snd)/self.args.print_interval))
                        else:
                            print("Batch %d time: %.2fs fst-loss: %.4f snd-loss: %.4f" \
                                % (i, time.time()-start, \
                                -sum(self.emb_model.loss_fst)/self.args.print_interval, \
                                -sum(self.emb_model.loss_snd)/self.args.print_interval))
                        self.emb_model.loss_fst = []
                        self.emb_model.loss_snd = []
                    else:
                        print("Batch %d, training time: %.2fs" %
                              (i, time.time() - start))
                    start = time.time()

            if self.args.async_update:
                self.emb_model.finish_async_update()

        print("Training used time: %.2fs" % (time.time() - start_all))
        if self.args.save_in_pt:
            self.emb_model.save_embedding_pt(self.dataset,
                                             self.args.output_emb_file)
        else:
            self.emb_model.save_embedding(self.dataset,
                                          self.args.output_emb_file)
Example #6
class Word2VecTrainer:
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=300,
                 batch_size=64,
                 window_size=5,
                 iterations=5,
                 initial_lr=1.0,
                 min_count=5):

        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            print("USING CUDA")
            self.skip_gram_model.cuda()
        else:
            print("CUDA FAIL")

    def train(self):

        for iteration in range(self.iterations):

            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                  lr=self.initial_lr)
            # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):

                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    # scheduler.step()
                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()

                    running_loss = running_loss * 0.95 + loss.item() * 0.05
                    if i > 0 and i % 400 == 0:
                        print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(
                self.data.id2word, self.output_file_name.format(iteration))
            self.initial_lr *= 0.7
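
Several of these trainers (Examples #2, #6, #7 and #12) hand the batching logic to a dataset-level collate function via collate_fn=dataset.collate. The sketch below shows one plausible shape for such a function; the assumption that each dataset item is a list of (center, context, negatives) triples is mine, not taken from the actual Word2vecDataset code.

import torch

def collate(batches):
    # Illustrative only: flatten a batch of per-sentence triple lists into the three
    # LongTensors (pos_u, pos_v, neg_v) that the training loops above consume.
    all_u = [u for batch in batches for u, _, _ in batch]
    all_v = [v for batch in batches for _, v, _ in batch]
    all_neg = [neg for batch in batches for _, _, neg in batch]
    return (torch.LongTensor(all_u),
            torch.LongTensor(all_v),
            torch.LongTensor(all_neg))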
Example #7
class Word2VecTrainer:
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=100,
                 batch_size=32,
                 window_size=5,
                 iterations=3,
                 initial_lr=0.001,
                 min_count=12):

        print("Reading input file...")
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        print("Creating data batches")
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):

        for iteration in range(self.iterations):

            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            count = 0

            for i, sample_batched in enumerate(self.dataloader):

                count += 1
                if count % 10000 == 0:
                    print("\n\nEpoch %d, %d batches processed" %
                          (iteration, count))

                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()  # step the LR scheduler after the optimizer update

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name)
Example #8
class Word2Vec:
    def __init__(
        self,
        input_file_name,
        input_wvectors,
        input_cvectors,
        input_ps,
        input_ns,
        output_file_name,
        emb_dimension=100,
        batch_size=50,
        window_size=5,
        kn=20,
        iteration=1,
        initial_lr=0.001,
        clip=1.0,
        min_count=30,
        batch_num_to_valid=100000,
    ):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            input_vectors: Pretrained vector
            input_psns: Pretrained positive sample & negative sample
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            kn: k neighbors.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.pre_wvectors = InputVector(input_wvectors)
        self.pre_cvectors = InputVector(input_cvectors)
        self.ps_w = load_from_pkl(input_ps)
        self.ns_w = load_from_pkl(input_ns)
        self.ps = convert_word_to_id(self.ps_w, self.data.word2id)
        self.ns = convert_word_to_id(self.ns_w, self.data.word2id)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.kn = kn
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.clip = clip
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                             self.pre_wvectors,
                                             self.pre_cvectors)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
        self.batch_num_to_valid = batch_num_to_valid

    def train(self, similarity_test_paths, synset_paths, analogy_paths):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)

        best_scores = dict()
        tmp_emb_dir = os.path.join(tempfile.gettempdir(), 'embedding')
        tmp_emb_path = os.path.join(
            tmp_emb_dir,
            ''.join(random.sample(string.ascii_letters + string.digits, 16)))

        for epoch in range(self.iteration):
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                      self.window_size)
                pos_u, mask_pos_u = self.data.get_ps_batch(
                    pos_pairs, self.ps, self.kn)
                neg_u, mask_neg_u = self.data.get_ns_batch(
                    pos_pairs, self.ns, self.kn)
                pair_u = [pair[0] for pair in pos_pairs]
                pair_v = [pair[1] for pair in pos_pairs]

                pair_u = Variable(torch.LongTensor(pair_u))
                pair_v = Variable(torch.LongTensor(pair_v))
                pos_u = Variable(torch.LongTensor(pos_u))
                mask_pos_u = Variable(torch.FloatTensor(mask_pos_u))
                neg_u = Variable(torch.LongTensor(neg_u))
                mask_neg_u = Variable(torch.FloatTensor(mask_neg_u))
                if self.use_cuda:
                    pair_u = pair_u.cuda()
                    pair_v = pair_v.cuda()
                    pos_u = pos_u.cuda()
                    mask_pos_u = mask_pos_u.cuda()
                    neg_u = neg_u.cuda()
                    mask_neg_u = mask_neg_u.cuda()

                self.optimizer.zero_grad()
                '''
                param = self.skip_gram_model.parameters()
                tmp = []
                try:
                    while True:
                        tmp.append(param.__next__())
                except:
                    pass
                '''
                loss = self.skip_gram_model.forward(pair_u, pair_v, pos_u,
                                                    mask_pos_u, neg_u,
                                                    mask_neg_u)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    self.skip_gram_model.parameters(), self.clip)
                self.optimizer.step()

                process_bar.set_description(
                    "Loss: %0.8f, lr: %0.6f" %
                    (loss.item(), self.optimizer.param_groups[0]['lr']))
                if i * self.batch_size % 100000 == 0:
                    lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr

                if i % self.batch_num_to_valid == 0:
                    logging.info('epoch%d_batch%d, evaluating...' % (epoch, i))
                    self.save_embedding(self.data.id2word, tmp_emb_path,
                                        self.use_cuda)

                    best_scores, save_flag = evaluation(
                        tmp_emb_path, similarity_test_paths, synset_paths,
                        analogy_paths, best_scores)
                    if save_flag == True:
                        emb_save_path = self.output_file_name + "_epoch%d_batch%d" % (
                            epoch, i)
                        shutil.move(tmp_emb_path, emb_save_path)
                        logging.info('Save current embedding to %s' %
                                     emb_save_path)

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name,
                                                self.use_cuda)
            logging.info('final evaluating...')
            self.save_embedding(self.data.id2word, tmp_emb_path, self.use_cuda)
            best_scores, save_flag = evaluation(tmp_emb_path,
                                                similarity_test_paths,
                                                synset_paths, analogy_paths,
                                                best_scores)
            if save_flag == True:
                emb_save_path = self.output_file_name + "_epoch%d" % epoch
                shutil.move(tmp_emb_path, emb_save_path)
                logging.info('Save current embedding to %s' % emb_save_path)

    def save_embedding(self, id2word, file_name, use_cuda):
        """Save all embeddings to file.

        Since this class only stores word ids, the mapping from id to word has to be passed in from outside.

        Args:
            id2word: map from word id to word.
            file_name: file name.
        Returns:
            None.
        """
        if use_cuda:
            embedding = self.skip_gram_model.u_embeddings.weight.cpu(
            ).data.numpy()
        else:
            embedding = self.skip_gram_model.u_embeddings.weight.data.numpy()
        with open(file_name, 'w') as fout:
            fout.write('%d %d\n' % (len(id2word), self.emb_dimension))
            for wid, w in id2word.items():
                e = ' '.join(str(x) for x in embedding[wid])
                fout.write('%s %s\n' % (w, e))
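
The save_embedding method above writes the plain word2vec text format (a '<vocab_size> <dim>' header followed by one '<word> <values...>' line per word), so the result can be read back with any word2vec-format loader. For example, assuming gensim is installed (the path and query word are placeholders):

from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format('embeddings.txt', binary=False)
print(vectors.most_similar('king', topn=5))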
Example #9
class Word2Vec:
    def __init__(self, log_filename: str,
                 output_filename: str,
                 embedding_dimension: int=100,
                 batch_size: int=128,
                 iteration: int=1,
                 initial_lr: float=0.025,
                 min_count: int=5,
                 sub_sampling_t: float = 1e-5,
                 neg_sampling_t: float = 0.75,
                 neg_sample_count: int = 5,
                 half_window_size: int = 2,
                 read_data_method: str='memory'):
        """
        init func
        """
        self.data = DataHanlder(log_filename=log_filename,
                                batch_size=batch_size,
                                min_count=min_count,
                                sub_sampling_t=sub_sampling_t,
                                neg_sampling_t=neg_sampling_t,
                                neg_sample_count=neg_sample_count,
                                half_window_size=half_window_size,
                                read_data_method=read_data_method)
        self.output_filename = output_filename
        self.embedding_dimension = embedding_dimension
        self.batch_size = batch_size
        self.half_window_size = half_window_size
        self.iter = iteration
        self.initial_lr = initial_lr
        self.sg_model = SkipGramModel(len(self.data.vocab), self.embedding_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.sg_model.cuda()
        self.optimizer = optim.SGD(self.sg_model.parameters(), lr=self.initial_lr)

    def train(self):
        i = 0
        # Approximate number of training pairs: 2 * half_window_size per word
        # (2 * half_window_size * total_word_count in total), minus the pairs lost at
        # sentence boundaries ((1 + 2 + ... + half_window_size) at each end of every sentence).
        # CAUTION: this is an approximation, not an exact count.
        approx_pair = 2 * self.half_window_size * self.data.total_word_count - \
                      (1 + self.half_window_size) * self.half_window_size * self.data.sentence_len
        batch_count = self.iter * approx_pair / self.batch_size
        for pos_u, pos_v, neg_samples in self.data.gen_batch():
            i += 1
            if self.data.sentence_cursor > self.data.sentence_len * self.iter:
                # reach max iter
                break
            # train iter
            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_samples))
            if self.use_cuda:
                pos_u, pos_v, neg_v = [i.cuda() for i in (pos_u, pos_v, neg_v)]

            # print(len(pos_u), len(pos_v), len(neg_v))
            self.optimizer.zero_grad()
            # the loss value is computed and returned directly inside forward
            loss = self.sg_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i % 100 == 0:
                # print(loss)
                print("step: %d, Loss: %0.8f, lr: %0.6f" % (i, loss.item(), self.optimizer.param_groups[0]['lr']))
            if i % (100000 // self.batch_size) == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr

        self.sg_model.save_embedding(self.data.id2word, self.output_filename, self.use_cuda)
Example #10
class DeepwalkTrainer:
    def __init__(self, args):
        """ Initializing the trainer with the input arguments """
        self.args = args
        self.dataset = DeepwalkDataset(
            net_file=args.net_file,
            map_file=args.map_file,
            walk_length=args.walk_length,
            window_size=args.window_size,
            num_walks=args.num_walks,
            batch_size=args.batch_size,
            negative=args.negative,
            num_procs=args.num_procs,
            fast_neg=args.fast_neg,
            )
        self.emb_size = len(self.dataset.net)
        self.emb_model = None

    def init_device_emb(self):
        """ set the device before training 
        will be called once in fast_train_mp / fast_train
        """
        choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
        assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
        assert self.args.num_procs >= 1, "The number of processes must be at least 1"
        choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
        assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"
        
        # initializing embedding on CPU
        self.emb_model = SkipGramModel(
            emb_size=self.emb_size, 
            emb_dimension=self.args.dim,
            walk_length=self.args.walk_length,
            window_size=self.args.window_size,
            batch_size=self.args.batch_size,
            only_cpu=self.args.only_cpu,
            only_gpu=self.args.only_gpu,
            mix=self.args.mix,
            neg_weight=self.args.neg_weight,
            negative=self.args.negative,
            lr=self.args.lr,
            lap_norm=self.args.lap_norm,
            adam=self.args.adam,
            sgd=self.args.sgd,
            avg_sgd=self.args.avg_sgd,
            fast_neg=self.args.fast_neg,
            )
        
        torch.set_num_threads(self.args.num_threads)
        if self.args.only_gpu:
            print("Run in 1 GPU")
            self.emb_model.all_to_device(0)
        elif self.args.mix:
            print("Mix CPU with %d GPU" % self.args.num_procs)
            if self.args.num_procs == 1:
                self.emb_model.set_device(0)
        else:
            print("Run in %d CPU process" % self.args.num_procs)

    def train(self):
        """ train the embedding """
        if self.args.num_procs > 1:
            self.fast_train_mp()
        else:
            self.fast_train()

    def fast_train_mp(self):
        """ multi-cpu-core or mix cpu & multi-gpu """
        self.init_device_emb()
        self.emb_model.share_memory()

        start_all = time.time()
        ps = []

        np_ = self.args.num_procs
        for i in range(np_):
            p = mp.Process(target=self.fast_train_sp, args=(i,))
            ps.append(p)
            p.start()

        for p in ps:
            p.join()
        
        print("Used time: %.2fs" % (time.time()-start_all))
        self.emb_model.save_embedding(self.dataset, self.args.emb_file)

    @thread_wrapped_func
    def fast_train_sp(self, gpu_id):
        """ a subprocess for fast_train_mp """
        if self.args.mix:
            self.emb_model.set_device(gpu_id)
        torch.set_num_threads(self.args.num_threads)

        sampler = self.dataset.create_sampler(gpu_id)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=4,
            )
        num_batches = len(dataloader)
        print("num batchs: %d in subprocess [%d]" % (num_batches, gpu_id))
        # number of positive node pairs in a sequence
        num_pos = int(2 * self.args.walk_length * self.args.window_size\
            - self.args.window_size * (self.args.window_size + 1))
        
        start = time.time()
        with torch.no_grad():
            max_i = self.args.iterations * num_batches
            
            for i, walks in enumerate(dataloader):
                # decay learning rate for SGD
                lr = self.args.lr * (max_i - i) / max_i
                if lr < 0.00001:
                    lr = 0.00001

                if self.args.fast_neg:
                    self.emb_model.fast_learn(walks, lr)
                else:
                    # do negative sampling
                    bs = len(walks)
                    neg_nodes = torch.LongTensor(
                        np.random.choice(self.dataset.neg_table, 
                            bs * num_pos * self.args.negative, 
                            replace=True))
                    self.emb_model.fast_learn(walks, lr, neg_nodes=neg_nodes)

                if i > 0 and i % self.args.print_interval == 0:
                    print("Solver [%d] batch %d tt: %.2fs" % (gpu_id, i, time.time()-start))
                    start = time.time()

    def fast_train(self):
        """ fast train with dataloader """
        # the number of positive node pairs in a node sequence
        num_pos = 2 * self.args.walk_length * self.args.window_size\
            - self.args.window_size * (self.args.window_size + 1)
        num_pos = int(num_pos)

        self.init_device_emb()

        sampler = self.dataset.create_sampler(0)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=4,
            )
        
        num_batches = len(dataloader)
        print("num batchs: %d" % num_batches)

        start_all = time.time()
        start = time.time()
        with torch.no_grad():
            max_i = self.args.iterations * num_batches
            for iteration in range(self.args.iterations):
                print("\nIteration: " + str(iteration + 1))
                
                for i, walks in enumerate(dataloader):
                    # decay learning rate for SGD
                    lr = self.args.lr * (max_i - i) / max_i
                    if lr < 0.00001:
                        lr = 0.00001

                    if self.args.fast_neg:
                        self.emb_model.fast_learn(walks, lr)
                    else:
                        # do negative sampling
                        bs = len(walks)
                        neg_nodes = torch.LongTensor(
                            np.random.choice(self.dataset.neg_table, 
                                bs * num_pos * self.args.negative, 
                                replace=True))
                        self.emb_model.fast_learn(walks, lr, neg_nodes=neg_nodes)

                    if i > 0 and i % self.args.print_interval == 0:
                        print("Batch %d, training time: %.2fs" % (i, time.time()-start))
                        start = time.time()

        print("Training used time: %.2fs" % (time.time()-start_all))
        self.emb_model.save_embedding(self.dataset, self.args.emb_file)
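
Both fast_train_sp and fast_train above take num_pos = 2 * walk_length * window_size - window_size * (window_size + 1) as the number of positive pairs per walk: each of the walk_length positions contributes up to window_size contexts on each side, and the window_size positions at each end of the walk together lose 1 + 2 + ... + window_size pairs. A brute-force sanity check of that count (example values only):

def count_pairs(walk_length, window_size):
    # For each position, count the context slots actually available on the left and right.
    return sum(min(i, window_size) + min(walk_length - 1 - i, window_size)
               for i in range(walk_length))

# Example values: 2*80*5 - 5*(5+1) = 770 positive pairs per walk of length 80, window 5.
assert count_pairs(80, 5) == 2 * 80 * 5 - 5 * (5 + 1)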
Example #11
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=100,
                 window_size=5,
                 iteration=5,
                 initial_lr=0.025,
                 min_count=5,
                 using_hs=False,
                 using_neg=False,
                 context_size=2,
                 hidden_size=128,
                 cbow=None,
                 skip_gram=None):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.
            using_hs: Whether using hierarchical softmax.

        Returns:
            None.
        """
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        print("Input Data", self.data)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        print("emb_size", self.emb_size)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            # self.cbow_model = CBOW(self.emb_size, self.context_size, self.emb_dimension, self.hidden_size)
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    # @profile
    def skip_gram_train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)

            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)

    def cbow_train(self):
        print("CBOW Training......")
        pair_count = self.data.evaluate_pair_count(self.context_size * 2 + 1)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_cbow_batch_all_pairs(
                self.batch_size, self.context_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)

            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            # loss = self.cbow_model.forwards(pos_v, pos_u, neg_v, neg_u)
            loss.backward()
            self.optimizer.step()
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
Example #12
class Word2VecTrainer:
    def __init__(self,
                 input_file,
                 antonym_file,
                 output_file,
                 emb_dimension=100,
                 batch_size=32,
                 window_size=5,
                 iterations=3,
                 initial_lr=0.001,
                 min_count=12):

        print("Reading input file...")
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        print("Creating data batches")
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)
        self.antonym_file = open(antonym_file, 'r')

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def calculate_antonym_loss(self):

        src_ids = []
        tgt_ids = []

        while len(src_ids) < self.batch_size:
            line = self.antonym_file.readline()
            if not line:
                #EOF reached
                self.antonym_file.seek(0)
            words = line.strip('\n').split()
            if len(words) < 2:
                continue
            src = words[0]
            tgt = random.choice(words[1:]).strip('\n')

            src_id = self.data.word2id.get(src, None)
            tgt_id = self.data.word2id.get(tgt, None)

            if src_id is None or tgt_id is None:
                continue

            src_ids.append(src_id)
            tgt_ids.append(tgt_id)

        #src_embedding = self.skip_gram_model.embed(torch.LongTensor(src_id).to(self.device))
        #tgt_embedding = self.skip_gram_model.embed(torch.LongTensor(tgt_id).to(self.device))

        input_src = torch.LongTensor(src_ids).to(self.device)
        input_tgt = torch.LongTensor(tgt_ids).to(self.device)
        src_embedding = torch.squeeze(self.skip_gram_model.embed(input_src))
        tgt_embedding = torch.squeeze(self.skip_gram_model.embed(input_tgt))

        #loss = torch.abs(torch.dot(src_embedding,tgt_embedding))
        loss = torch.abs(
            torch.sum(torch.mul(src_embedding, tgt_embedding), dim=1))
        loss = loss / (torch.norm(src_embedding, dim=1) *
                       torch.norm(tgt_embedding, dim=1))
        return torch.mean(loss)

    def train(self):

        for iteration in range(self.iterations):

            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            count = 0

            for i, sample_batched in enumerate(self.dataloader):

                count += 1
                if count % 10000 == 0:
                    print("\n\nEpoch %d, %d batches processed" %
                          (iteration, count))

                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()

                    skip_gram_loss = self.skip_gram_model.forward(
                        pos_u, pos_v, neg_v)
                    antonym_loss = 100 * self.calculate_antonym_loss()

                    loss = skip_gram_loss + antonym_loss

                    loss.backward()
                    optimizer.step()
                    scheduler.step()  # step the LR scheduler after the optimizer update

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 50000 == 0:
                        print(" Loss: " + str(running_loss) + ' sk: ' +
                              str(skip_gram_loss.data) + ' ant: ' +
                              str(antonym_loss.data))

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name)
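
The antonym term in calculate_antonym_loss above is the mean absolute cosine similarity between the embeddings of each antonym pair, which pushes antonym vectors toward orthogonality. An equivalent standalone formulation of that penalty, written here as a sketch with torch.nn.functional rather than the explicit dot-product/norm arithmetic used above:

import torch.nn.functional as F

def antonym_penalty(src_embedding, tgt_embedding):
    # Mean |cos(src, tgt)| over the batch; zero when antonym embeddings are orthogonal.
    return F.cosine_similarity(src_embedding, tgt_embedding, dim=1).abs().mean()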
Example #13
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=100,
                 window_size=5,
                 iteration=5,
                 initial_lr=0.025,
                 min_count=5,
                 using_hs=False,
                 using_neg=False,
                 context_size=2,
                 hidden_size=128,
                 cbow=None,
                 skip_gram=None):

        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    def skip_gram_train(self):
        """Multiple training.

        Returns:
            None.
        """
        print("Skip_Gram Training......")
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)

            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.data[0], self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.data[0], self.optimizer.param_groups[0]['lr']))
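            # Roughly every 100k processed word pairs, linearly decay the learning
            # rate toward zero over the course of training (word2vec-style schedule).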
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("Skip_Gram Trained and Saving File......")
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
        print("Skip_Gram Trained and Saved File.")

    def cbow_train(self):
        print("CBOW Training......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        pos_all_pairs = self.data.get_cbow_batch_all_pairs(
            self.batch_size, self.context_size)
        pair_count = len(pos_all_pairs)
        process_bar = tqdm(range(int(pair_count / self.batch_size)))
        for _ in process_bar:
            pos_pairs = self.data.get_cbow_batch_pairs(self.batch_size,
                                                       self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)

            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
Example #14
0
                neg_v = neg_v.cuda()

            loss_val = model_1(doc_u, pos, neg_v)

            # print(str(i)+'   '+str(loss_val))
            loss.append(loss_val.data.cpu().numpy())
            loss_val.backward()
            opt.step()

        if doc_id not in loss_g:
            loss_g[doc_id] = [np.mean(loss)]
        else:
            loss_g[doc_id].append(np.mean(loss))

    l = np.mean([loss_g[k][i] for k in loss_g])

    print('epoch - ' + str(i) + '\tloss - ' + str(l))

print('Completed')

iter_loss = [np.mean([loss_g[x][i] for x in loss_g]) for i in range(epoch)]
print(iter_loss)

with open('./' + dataset + '/loss.json', 'wb') as f:
    pickle.dump(loss_g, f)

with open('./' + dataset + '/iter_loss.json', 'wb') as f:
    pickle.dump(iter_loss, f)

model_1.save_embedding(cuda, dataset)
Example #15
0
class Word2Vec:
    def __init__(self,
                 output_file_name,
                 output_sense_name,
                 emb_dimension=128,
                 K=5,
                 batch_size=1,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.1,
                 createClusterLambda=1.5,
                 min_count=0):
        """Initilize class parameters.
        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.
        Returns:
            None.
        """
        self.data = InputData(min_count)
        self.output_file_name = output_file_name
        self.output_sense_name = output_sense_name
        self.emb_size = len(self.data.node2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.K = K
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.createClusterLambda = createClusterLambda
        self.skip_gram_model = SkipGramModel(self.emb_size, self.K,
                                             self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training.
        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        total_pos_pairs = self.data.get_node_pairs(self.window_size)
        print("training\n")
        for t in process_bar:
            pos_pairs = total_pos_pairs[t]
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            # right=[]
            curword = pos_u[0]
            contextwords = list(pos_v)
            contextwords_cuda = list(pos_v)
            contextembedding = torch.zeros(self.emb_dimension)
            contextwords_cuda = Variable(torch.LongTensor(contextwords_cuda))
            if self.use_cuda:
                contextwords_cuda = contextwords_cuda.cuda()
            emb_v = self.skip_gram_model.v_embeddings(contextwords_cuda)
            if self.use_cuda:
                emb_v_data = emb_v.cpu().data
            else:
                emb_v_data = emb_v.data
            for i in range(len(contextwords)):
                contextembedding += emb_v_data[i]
                # torch.add(contextembedding,emb_v_data[i,:],out=emb_v_data_total)
            emb_v_data_avg = contextembedding / (len(contextwords))
            # torch.div(emb_v_data_total,len(contextwords),out=emb_v_data_avg)
            minDist = np.inf
            rightsense = 0
            mu = torch.Tensor(self.emb_dimension)
            if self.skip_gram_model.num_sense[curword] == self.K:
                nC = self.K
            else:
                nC = self.skip_gram_model.num_sense[curword] + 1
            prob = torch.Tensor(nC)
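            # For each existing sense of curword, measure the cosine distance between the
            # averaged context embedding and that sense's cluster centroid and keep the
            # nearest one; if the smallest distance exceeds createClusterLambda and the
            # word still has fewer than K senses, a new sense cluster is created below.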
            for k in range(self.skip_gram_model.num_sense[curword]):
                torch.div(self.skip_gram_model.clusterCenter[curword, k, :],
                          self.skip_gram_model.clusterCount[curword][k],
                          out=mu)
                x_norm = torch.norm(emb_v_data_avg, p=2)
                y_norm = torch.norm(mu, p=2)
                summ = 0
                for p in range(self.emb_dimension):
                    summ += emb_v_data_avg[p] * mu[p]
                dist = 1 - summ / (x_norm * y_norm)
                prob[k] = dist
                if dist < minDist:
                    minDist = dist
                    rightsense = k
            if self.skip_gram_model.num_sense[curword] < self.K:
                if self.createClusterLambda < minDist:
                    prob[self.skip_gram_model.
                         num_sense[curword]] = self.createClusterLambda
                    rightsense = self.skip_gram_model.num_sense[curword]
                    self.skip_gram_model.num_sense[curword] += 1
            for i in range(self.emb_dimension):
                self.skip_gram_model.clusterCenter[curword][rightsense][
                    i] += emb_v_data_avg[i]
            self.skip_gram_model.clusterCount[curword][rightsense] += 1
            # for i in range(len(contextwords)):
            #    right.append(rightsense)

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v,
                                                rightsense, self.use_cuda)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.data[0], self.optimizer.param_groups[0]['lr']))
            if t * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * t / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2node,
                                            self.output_file_name,
                                            self.output_sense_name,
                                            self.use_cuda)
Example #16
0
class Word2Vec(object):
    def __init__(self,output_file_name,
            walks = [],
            emb_dimension=100,
            batch_size=64,
            window_size=5,
            epochs=5,
            negative_num=5):
        print("Load data...")
        self.data = InputData(window_size, batch_size, walks)
        self.output_file_name = output_file_name
        self.emb_dimension = emb_dimension
        self.epochs = epochs
        self.negative_num = negative_num
        self.batch_size = batch_size
        self.vocab_size = self.data.vocab_size
        self.model = SkipGramModel(self.vocab_size, self.emb_dimension)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1.0)

        if cuda_gpu:
            self.model = self.model.cuda()

    def train_model(self):
        for _ in tqdm(range(self.epochs)):
            step = 0
            avg_loss = 0
            for pos_pairs in self.data.data_iter:

                target_word = pos_pairs[0][:,0]
                context_word = pos_pairs[0][:,1]
                neg_word = self.data.get_negative_sample(pos_pairs[0], 3)

                if cuda_gpu:
                    target_word = torch.tensor(target_word, dtype=torch.long).cuda()
                    context_word= torch.tensor(context_word, dtype=torch.long).cuda()
                    neg_word = torch.tensor(neg_word, dtype=torch.long).cuda()
                    loss = self.model(target_word, context_word, neg_word).cuda()

                else:
                    target_word = torch.tensor(target_word, dtype=torch.long)
                    context_word= torch.tensor(context_word, dtype=torch.long)
                    neg_word = torch.tensor(neg_word, dtype=torch.long)

                    loss = self.model(target_word, context_word, neg_word)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                if cuda_gpu:
                    avg_loss += loss.cpu().item()
                else:
                    # print(loss.item())
                    avg_loss += loss.item()
                step += 1
                if step % 2000 == 0 and step > 0:
                    avg_loss /= 2000
                    print("Average loss at step ", step, ": ", avg_loss)
                    avg_loss = 0

        self.model.save_embedding(self.output_file_name)
        print("~ done.")
Example #17
0
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=50,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.025,
                 min_count=1):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.data[0], self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
Example #18
0
class Word2VecTrainer:
    def __init__(self, args):# input_file, output_file, emb_dimension=100, batch_size=32, window_size=5, iterations=3,initial_lr=0.01, min_count=25,weight_decay = 0, time_scale =1

        # self.data = DataReader(args.text, args.min_count)
        # if not args.use_time:
        #      dataset = Word2vecDataset(self.data, args.window_size)
        # else:
        #     dataset = TimestampledWord2vecDataset(self.data, args.window_size,args.time_scale)
        #
        # self.dataloader = DataLoader(dataset, batch_size=args.batch_size,
        #                              shuffle=True, num_workers=0, collate_fn=dataset.collate)
        self.data,self.dataloader = self.load_train(args) # self.data

        if "train" in args.text:
            test_filename = args.text.replace("train","test")
            if os.path.exists(test_filename):
                print("load test dataset: {}".format(test_filename))
                self.test = self.load_train(args, data = self.data, filename=test_filename, is_train=False )
            else:
                self.test = None

            dev_filename = args.text.replace("train", "dev")
            if os.path.exists(dev_filename):
                print("load dev dataset: {}".format(dev_filename))
                self.dev = self.load_train(args, data = self.data, filename=dev_filename, is_train=False)
            else:
                self.dev = None
        else:
            self.dev, self.test = None, None

        
        if args.use_time:
            self.output_file_name = "{}/{}".format(args.output, args.time_type)
            if args.add_phase_shift:
                self.output_file_name  += "_shift"
        else:
            self.output_file_name = "{}/{}".format(args.output, "word2vec")
        if not os.path.exists(args.output):
            os.mkdir(args.output)
        if not os.path.exists(self.output_file_name):
            os.mkdir(self.output_file_name)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.emb_dimension
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.lr = args.lr
        self.time_type = args.time_type
        self.weight_decay = args.weight_decay

        print(args)


        if args.use_time:
            self.skip_gram_model = TimestampedSkipGramModel(self.emb_size, self.emb_dimension,time_type = args.time_type,add_phase_shift=args.add_phase_shift) 
        else:
            self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            print("using cuda and GPU ....")
            self.skip_gram_model.cuda()

        # load_path = "{}/{}".format(self.output_file_name)
        # torch.save(self.skip_gram_model,"pytorch.bin")
        # self.skip_gram_model =  torch.load("pytorch.bin")
        # self.skip_gram_model = load_model(self.skip_gram_model,"pytorch.bin")
        # exit()
        if not args.from_scatch and os.path.exists(self.output_file_name):

            print("loading parameters  ....")
            self.skip_gram_model.load_embeddings(self.data.id2word,self.output_file_name)

    def load_train(self,args,data= None, filename = None, is_train = True):
        if data is None:
            assert is_train==True, "wrong to load data 1"
            data = DataReader(args.text, args.min_count)
            filename = args.text
        else:
            assert is_train == False, "wrong to load test data 2"
            assert filename is not None, "wrong to load test data 3"
            assert data is not None, "wrong to load test data 4"
        if not args.use_time:
            dataset = Word2vecDataset(data, input_text = filename, window_size= args.window_size)
        else:
            dataset = TimestampledWord2vecDataset(data,input_text = filename, window_size= args.window_size, time_scale=args.time_scale)

        dataloader = DataLoader(dataset, batch_size=args.batch_size,
                                     shuffle=is_train, num_workers=0, collate_fn=dataset.collate) # shuffle if it is train
        if is_train:
            return data,dataloader
        else:
            return dataloader

    def evaluation_loss(self,logger =None):
        results = []
        self.skip_gram_model.eval()
        print("evaluating ...")
        for index,dataloader in enumerate([self.dev,self.test]):
            if dataloader is None:
                continue
            losses = []
            for i, sample_batched in enumerate(tqdm(dataloader)):
                if len(sample_batched[0]) > 1:

                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    if args.use_time:
                        time = sample_batched[3].to(self.device)
                        # print(time)
                        loss, pos, neg = self.skip_gram_model.forward(pos_u, pos_v, neg_v, time)
                    else:

                        loss, pos, neg = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    # print(loss)
                    losses.append(loss.item())
            mean_result = np.array(losses).mean()
            results.append(mean_result)
            print("test{} loss is {}".format(index, mean_result))
            logger.write("Loss in  test{}: {} \n".format( index, str(mean_result)))
            logger.flush()

        self.skip_gram_model.train()
        return results

    def train(self):
        print(os.path.join(self.output_file_name,"log.txt"))
        if not os.path.exists(self.output_file_name):
            os.mkdir(self.output_file_name)
        optimizer = optim.Adam(self.skip_gram_model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader)*self.iterations)


        with open("{}/log.txt".format(self.output_file_name,"log.txt"),"w") as f:
            for iteration in range(self.iterations):

                print("\nIteration: " + str(iteration + 1))
                f.write(str(args) +"\n")
                # optimizer = optim.SparseAdam(self.skip_gram_model.parameters(), lr=self.initial_lr)


                running_loss = 0.0
                for i, sample_batched in enumerate(tqdm(self.dataloader)):
                    if len(sample_batched[0]) > 1:

                        pos_u = sample_batched[0].to(self.device)
                        pos_v = sample_batched[1].to(self.device)
                        neg_v = sample_batched[2].to(self.device)

                        optimizer.zero_grad()
                        if args.use_time:
                            time = sample_batched[3].to(self.device)
                            # print(time)
                            loss,pos,neg = self.skip_gram_model.forward(pos_u, pos_v, neg_v,time)
                        else:

                            loss,pos,neg = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                        # print(loss)

                        loss.backward()
                        optimizer.step()
                        scheduler.step()



                        loss,pos,neg = loss.item(),pos.item(),neg.item()

                        if  i % args.log_step == 0: # i > 0 and
                            f.write("Loss in {} steps: {} {}, {}\n".format(i,str(loss),str(pos),str(neg)))

                        if  not torch.cuda.is_available() or i % (args.log_step*10) == 0 :
                            print("Loss in {} steps: {} {}, {}\n".format(i,str(loss),str(pos),str(neg)))
                self.evaluation_loss(logger=f)
                epoch_path = os.path.join(self.output_file_name,str(iteration))
                if not os.path.exists(epoch_path):
                    os.mkdir(epoch_path)

                torch.save(self.skip_gram_model, os.path.join( epoch_path,"pytorch.bin") )

                self.skip_gram_model.save_embedding(self.data.id2word, os.path.join(self.output_file_name,str(iteration)))
                self.skip_gram_model.save_in_text_format(self.data.id2word,
                                                         os.path.join(self.output_file_name, str(iteration)))
            self.skip_gram_model.save_in_text_format(self.data.id2word,self.output_file_name)


            torch.save(self.skip_gram_model, os.path.join(self.output_file_name,"pytorch.bin") )
            with open(os.path.join(self.output_file_name,"config.json"), "wt") as f:
                json.dump(vars(args), f, indent=4)
            self.skip_gram_model.save_dict(self.data.id2word,self.output_file_name)
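A standalone sketch of the cosine-annealing schedule the trainer above sets up (one scheduler step per batch, after the optimizer update); the step count stands in for len(self.dataloader) * self.iterations:

import torch
from torch import optim

param = torch.nn.Parameter(torch.zeros(1))
param.grad = torch.zeros_like(param)       # dummy gradient so optimizer.step() applies an update
optimizer = optim.Adam([param], lr=0.025)
total_steps = 1000                         # stands in for len(dataloader) * iterations
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, total_steps)
for _ in range(total_steps):
    optimizer.step()                       # parameter update first ...
    scheduler.step()                       # ... then anneal the learning rate
print("final lr:", scheduler.get_last_lr()[0])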
Example #19
0
class Node2Vec:
    def __init__(self, args, graph):
        print("\nPerforming Node2vec...\n")
        # 1. generate walker
        walker = DeepWalker(args, graph)
        print("\nDoing deepwalks...\n")
        walker.create_features()

        self.inputFileName = "{}{}-deepwalk_{}-num_walks_{}-len_metapath.txt".format(
            args.input_path, args.idx_metapath, args.number_of_walks,
            args.walk_length)

        # 2. read data
        self.data = DataReader(args.min_count, args.care_type,
                               self.inputFileName)

        # 3. make dataset for training
        dataset = DatasetLoader(self.data, args.window_size)

        # 4. initialize dataloader
        self.dataloader = DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)

        self.output_file_name = "{}{}-embedding_{}-deepwalk_{}-dim_{}-initial_lr_{}-window_size_{}-iterations_{}-min_count.pickle".format(
            args.output_path, args.idx_embed, args.idx_metapath, args.dim,
            args.initial_lr, args.window_size, args.iterations, args.min_count)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):

                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    # Step the LR scheduler after the optimizer update (required since PyTorch 1.1).
                    scheduler.step()
                    running_loss = running_loss * 0.9 + loss.item() * 0.1
            print(" Loss: " + str(running_loss))
            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name)
Example #20
0
class Word2Vec:
    """ Word2Vec class module for extracting triples and training.
    
    """
    def __init__(self, ifolder, ofolder, 
                 emb_dimension=400,
                 batch_size=32,
                 iteration=int(sys.argv[3]),
                 initial_lr=0.025):
        
        self.ifolder = ifolder
        
        self.outfolder = ofolder+ifolder.rsplit('/',2)[1]+'/'
        try:
            os.makedirs(self.outfolder)
        except FileExistsError:
            print(self.outfolder + " folder exists. Will be overwritten")
        
        self.emb_dimension = emb_dimension
        self.initial_lr = initial_lr
        self.iteration = iteration
        self.batch_size = batch_size
        self.fpos = 0
        self.fneg = 0
        
        self.id2word = dict()
        self.id2pair = dict()
        self.pair2id = dict()
        
        self.read_word_dict(ifolder+"Word2Id")
        self.read_pair_dict(ifolder+"Pair2Id")
        
        self.pair_count = self.evaluate_pair_count()
        self.positive_pairs = np.zeros((self.pair_count, 2), dtype=int)
       
        # Dummy values to ensure size does not change
        self.negative_pairs = np.zeros((self.pair_count, 5), dtype=int)
        
        print(" Size of :", sys.getsizeof(self.positive_pairs))
        print(" Size of :", sys.getsizeof(self.negative_pairs))
        #ipdb.set_trace()
        
        self.emb_size     = len(self.id2word)
        self.pair_emb_size = len(self.id2pair)
        
        
        self.skip_gram_model = SkipGramModel(self.pair_emb_size,self.emb_size, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
        
        print("Start reading pairs")
        
    def read_word_dict(self, wdictfile ):
        
        with open(wdictfile) as inputFile:
            
            for item in inputFile:
                word,wid = item.split()
                self.id2word[int(wid)] = word
                          
        print("\n Completed reading word dictionary.")
    def read_pair_dict(self, pdictfile ):
        
        with open(pdictfile) as inputFile:
            
            for item in inputFile:
                word1,word2,pid = item.split()
               
                self.id2pair[int(pid)] = word1+':::'+word2
                self.pair2id[(word1,word2)] = int(pid)
                #print(self.id2pair[int(pid)],word1+':::'+word2)
                
        print("\n Completed reading pair dictionary.")
        
        self.cross_verification_BLESS()
        self.cross_verification_EVAL()
        
        
    def evaluate_pair_count(self):
        
        self.datasets = dict()
        
        dsfile = self.ifolder+"Statistics"
        with open(dsfile) as inputFile:
             
            for item in inputFile:
                
                if re.match("Dataset",item):
                    i = item.split(':')[1]
                    
        print("Total positive pair samples :",i)        
        return int(i)
    
    def read_pairs(self, posFile, negFile):
        """
        Read triples from file and update self.positive_pairs & self.negative_pairs
        
        """
        posDsfile = self.ifolder+posFile
        
        index = 0
        #ipdb.set_trace()
        with open(posDsfile) as inputFile:
            
            for line in inputFile:
                
                pid, wid = line.split()
                #self.positive_pairs.append([int(pid),int(wid)])
                self.positive_pairs[index] = [int(pid),int(wid)]
                index += 1
        print("Size of :", sys.getsizeof(self.positive_pairs))
        
        negDsfile = self.ifolder+negFile
        
        index = 0
        with open(negDsfile) as inputFile:
            for line in inputFile:
                temp = [int(i) for i in line.split()]
                self.negative_pairs[index] = temp
                index += 1
        print(" Size of :", sys.getsizeof(self.negative_pairs))
        
    def get_batch_pairs(self, batch_count):
        
        return self.positive_pairs[(batch_count)*self.batch_size:(batch_count+1)*self.batch_size]
        
        
    def get_neg_v(self, batch_count):
        
        return self.negative_pairs[(batch_count)*self.batch_size:(batch_count+1)*self.batch_size]
        
    
    def cross_verification_BLESS(self):
        """
        Optional method
        To verify how many BLESS dataset elements are mapped with model pairs 
        """
        #Remove the file if it already exists
        try:
            os.remove(self.outfolder+"BlessSet.txt")
        except:
            pass
        
        #Remove the file if it already exists
        try:
            os.remove(self.outfolder+"BlessSet_Except.txt")
        except:
            pass
        
        blessExceptFile = open(self.outfolder+"BlessSet_Except.txt","w")
        blessFile = open(self.outfolder+"BlessSet.txt","w")
        
        self.Bless_id2pair = dict()
        
        with open("/home/achingacham/Model/GRID_data/Evaluation_Datasets/BLESS_UniqueTuples") as evalFile:
            testDataset = evalFile.readlines()
            
            for items in testDataset:
                nouns = items.split()
                search_key = (nouns[0],nouns[1])
                rev_search_key = (nouns[1],nouns[0])
                
                if (search_key in self.pair2id):
                    temp_id = self.pair2id[search_key]
                    self.Bless_id2pair[temp_id] = nouns[0]+':::'+nouns[1]
                    blessFile.write(items)
                
                else:
                    blessExceptFile.write(items)                
               
        
        print("Completed cross validation with Blessset")
        blessExceptFile.close()
        blessFile.close()

    def cross_verification_EVAL(self):
        """
        Optional method
        To verify how many EVAL dataset elements are mapped with model pairs 
        """

        #Remove the file if it already exists
        try:
            os.remove(self.outfolder+"EvalSet.txt")
        except:
            pass
        
        #Remove the file if it already exists
        try:
            os.remove(self.outfolder+"EvalSet_Except.txt")
        except:
            pass
        
        EVALExceptFile = open(self.outfolder+"EvalSet_Except.txt","w")
        EVALFile = open(self.outfolder+"EvalSet.txt","w")
        
        self.Eval_id2pair = dict()
        
        with open("/home/achingacham/Model/GRID_data/Evaluation_Datasets/EVAL_UniqueTuples") as evalFile:
            testDataset = evalFile.readlines()
            
            for items in testDataset:
                nouns = items.split()
                search_key = (nouns[0],nouns[1])
                rev_search_key = (nouns[1],nouns[0])
                
                if (search_key in self.pair2id):
                    temp_id = self.pair2id[search_key]
                    self.Eval_id2pair[temp_id] = nouns[0]+':::'+nouns[1]
                    EVALFile.write(items)
                
                else:
                    EVALExceptFile.write(items)                
               
        
        print("Completed cross validation with Blessset")
        EVALExceptFile.close()
        EVALFile.close()
        

        
    def train(self):
        """Multiple training.
        Returns:
            None.
        """
        
        batch_count = self.pair_count / self.batch_size
            
        for epoch in range(self.iteration):
            
            print("\n Epoch :", epoch)
            
            output_file_name = self.outfolder+"Epoch_"+str(epoch)+"_EMB_"+str(self.emb_dimension)+"_All.txt"
            Bless_output_file_name = self.outfolder+"Epoch_"+str(epoch)+"_EMB_"+str(self.emb_dimension)+"_Bless.txt"
        
            epochLoss = 0
            
            process_bar = tqdm(range(int(batch_count)))
            
            for i in process_bar:
            
                pos_pairs = self.get_batch_pairs(i)
                neg_v = self.get_neg_v(i) 
                
                
                pos_u = np.array([pair[0] for pair in pos_pairs])   #index to the pair of Nouns
                pos_v = np.array([pair[1] for pair in pos_pairs])   #a context word (for instance, inbetween word)
                
                
                
                #pos_u = Variable(torch.LongTensor(pos_u))
                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v)) #a negative context word from unigram distribution
                
                      
                if self.use_cuda:
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()

                
                
                self.optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                
                loss.backward()
                self.optimizer.step()

                process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                            (loss.data[0],self.optimizer.param_groups[0]['lr']))
                
                epochLoss += loss.data[0]
                
                if i * self.batch_size % 100000 == 0:
                    lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr

            print("\n Average Epoch Loss: ", epochLoss/batch_count)
            
            self.skip_gram_model.save_embedding(self.id2pair, output_file_name, self.use_cuda)    
Example #21
0
class Metapath2Vec:
    def __init__(self, args, graph):
        # 1. generate walker
        walker = MetaPathWalker(args, graph)

        files = os.listdir(args.input_path)
        is_file = False
        for file in files:
            fullFilename = os.path.join(args.input_path, file)
            # if file exists, load the file.
            if file.startswith(args.idx_metapath):
                is_file = True
                print("\n !!! Found the file that you have specified...")
                self.inputFileName = "{}{}-metapath_{}-whichmeta_{}-num_walks_{}-len_metapath.txt".format(
                    args.input_path, args.idx_metapath, args.which_metapath,
                    args.num_walks, args.len_metapath)
                print("### Metapaths Loaded...", self.inputFileName)

        # if file does not exists, create the new one.
        if not is_file:
            print("\n !!! There is no metapaths with the given parameters...")
            print("### Creating new Metapaths...")
            self.metapaths = walker.generate_metapaths(args)
            walker.create_metapath_walks(args, args.num_walks, self.metapaths)
            self.inputFileName = "{}{}-metapath_{}-whichmeta_{}-num_walks_{}-len_metapath.txt".format(
                args.input_path, args.idx_metapath, args.which_metapath,
                args.num_walks, args.len_metapath)
            print("### Metapaths Loaded...", self.inputFileName)

        # 2. read data
        print(
            "\n\n##########################################################################"
        )
        print("### Metapaths to DataLoader...", self.inputFileName)
        self.data = DataReader(args.min_count, args.care_type,
                               self.inputFileName)

        # 3. make dataset for training
        dataset = DatasetLoader(self.data, args.window_size)

        # 4. initialize dataloader
        self.dataloader = DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)
        self.output_file_name = "{}{}-embedding_{}-metapath_{}-dim_{}-initial_lr_{}-window_size_{}-iterations_{}-min_count-_{}-isCSP_{}-CSPcoef.pickle".format(
            args.output_path, args.idx_embed, args.idx_metapath, args.dim,
            args.initial_lr, args.window_size, args.iterations, args.min_count,
            args.CSP_train, args.CSP_coef)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.aux_mode = args.CSP_train
        self.aux_coef = args.CSP_coef

        if args.CSP_train:
            print("\n\n#####################################")
            print("### SkipGram with CSP")
            self.skip_gram_model = SkipGramModelAux(self.emb_size,
                                                    self.emb_dimension,
                                                    nodes=self.data.id2word,
                                                    aux_coef=self.aux_coef,
                                                    CSP_save=args.CSP_save)
        else:
            print("\n\n#####################################")
            print("### SkipGram Normal")
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        for iteration in range(self.iterations):
            #print(self.skip_gram_model.u_embeddings.weight.data)
            print("\n\n\nIteration: " + str(iteration + 1))
            # Temporary Fix!
            if self.aux_mode:
                u = self.skip_gram_model.u_embeddings.weight
                v = self.skip_gram_model.v_embeddings.weight
                e = self.skip_gram_model.encoder.weight
                optimizer = optim.Adam([u, v], lr=self.initial_lr)
                aux_optimizer = optim.Adam([e], lr=0.001)
                aux_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                    aux_optimizer, len(self.dataloader))
            else:
                optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                             lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    if self.aux_mode:
                        aux_optimizer.zero_grad()

                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    # Step the LR schedulers after the optimizer updates (required since PyTorch 1.1).
                    scheduler.step()
                    if self.aux_mode:
                        aux_optimizer.step()
                        aux_scheduler.step()
                    running_loss = running_loss * 0.9 + loss.item() * 0.1

                    #if i > 0 and i % int(len(self.dataloader)/3) == 0:
            print(" Loss: " + str(running_loss))
            if self.aux_mode:
                print(" Auxiliary Loss: " +
                      str(self.skip_gram_model.aux_loss.item()))

        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
Example #22
0
class Word2VecTrainer:
    def __init__(self,
                 inFile,
                 outFile,
                 prFile=None,
                 emb_dimensions=100,
                 batch_size=512,
                 window_size=5,
                 iterations=50,
                 initial_lr=0.003):

        self.data = DataReader(inFile, txtFile=prFile)
        dataset = Word2VecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = outFile
        self.emb_size = len(self.data.word2id)
        self.batch_size = batch_size
        self.emb_dimensions = emb_dimensions
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size,
                                             self.emb_dimensions)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if self.use_cuda else 'cpu')

        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):

        loss_history = []
        spear_history = []
        best_spearman = 0.0

        for itr in range(self.iterations):
            print("\nIteration: " + str(itr + 1))
            optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                  lr=self.initial_lr)

            running_loss = 0.0
            for i, batch in enumerate(tqdm(self.dataloader)):

                #                 print("V Vector:", batch[0])
                #                 print("U Mat:", batch[1])
                #                 print("Neg Sample:", batch[2])

                pos_v = batch[0].to(self.device)
                pos_u = batch[1].to(self.device)
                neg_u = batch[2].to(self.device)

                optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_v, pos_u, neg_u)

                loss.backward()

                optimizer.step()

                running_loss = running_loss * 0.9 + loss.item() * 0.1

            print("Loss: " + str(running_loss))
            loss_history.append(running_loss)

            new_spearman = self.test(inFile="wordsim353/combined.csv")
            spear_history.append(new_spearman)

            if new_spearman > best_spearman:
                self.skip_gram_model.save_embedding(self.data.id2word,
                                                    self.output_file_name)
                best_spearman = new_spearman

        return loss_history, spear_history

    def test(self, inFile, embFile="emb_art_10.npy"):

        self.cos_dict = dict()
        self.cos_dict_id = dict()

        # 1. Import wordsim353 and visualize it
        csv = pd.read_csv(inFile)
        csv = np.array(csv)

        idsim = dict()
        wordsim = dict()

        for (word_a, word_b, num) in csv:
            if word_a in self.data.word2id and word_b in self.data.word2id:
                idsim[(self.data.word2id[word_a],
                       self.data.word2id[word_b])] = num
                wordsim[(word_a, word_b)] = num

        # 2. Load embeddings & normalize them
        if not self.skip_gram_model.v_embeddings:
            self.embeddings = np.load(embFile, allow_pickle=True)
        else:
            self.embeddings = self.skip_gram_model.v_embeddings.weight.cpu(
            ).data.numpy()

        # 3. Compute Cosine Similarities
        for (id_a, id_b), value in idsim.items():

            embeddings_a = self.embeddings[id_a].reshape(1, -1)
            embeddings_b = self.embeddings[id_b].reshape(1, -1)

            similarity = cosine_similarity(embeddings_a,
                                           embeddings_b)[0].item()

            self.cos_dict[(self.data.id2word[id_a],
                           self.data.id2word[id_b])] = similarity
            self.cos_dict_id[id_a, id_b] = similarity

        # Array form
        a = []
        b = []
        for (id_a, id_b), value in idsim.items():
            a.append(value)
            b.append(self.cos_dict_id[(id_a, id_b)])

        print("Spearman Coefficient:",
              spearman_correlation(self.cos_dict_id, idsim))
        spear = spearmanr(a, b)

        print(spear)

        return spear[0]
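A driver sketch for the trainer above, assuming a DataReader-compatible corpus file and that wordsim353/combined.csv (read inside test()) is available; the paths are placeholders:

if __name__ == '__main__':
    trainer = Word2VecTrainer(inFile='corpus.txt',
                              outFile='best_embeddings.txt',
                              iterations=10)
    loss_history, spear_history = trainer.train()
    print("Best Spearman correlation reached:", max(spear_history))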