Example #1
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 vocab: 'Word2VecVocab',
                 neg_sample=20,
                 padding_idx=0,
                 neg_weight=True):
        """

        :param word2vec:
        :param neg_sample: the number of negative sampling (5~20 for small datasets, 2~5 for large datasets)
        """
        super(SGNSModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.ivectors = torch.nn.Embedding(self.vocab_size,
                                           self.embedding_size,
                                           padding_idx=padding_idx)
        self.ovectors = torch.nn.Embedding(self.vocab_size,
                                           self.embedding_size,
                                           padding_idx=padding_idx)
        self.ivectors.weight = torch.nn.Parameter(
            torch.cat([
                torch.zeros(1, self.embedding_size),
                torch.FloatTensor(self.vocab_size - 1,
                                  self.embedding_size).uniform_(
                                      -0.5 / self.embedding_size,
                                      0.5 / self.embedding_size)
            ]))
        self.ovectors.weight = torch.nn.Parameter(
            torch.cat([
                torch.zeros(1, self.embedding_size),
                torch.FloatTensor(self.vocab_size - 1,
                                  self.embedding_size).uniform_(
                                      -0.5 / self.embedding_size,
                                      0.5 / self.embedding_size)
            ]))
        self.ivectors.weight.requires_grad = True
        self.ovectors.weight.requires_grad = True

        self.vocab_size = len(vocab)
        self.neg_sample = neg_sample
        if (neg_weight and neg_sample > 0) and (vocab is not None and
                                                vocab.idx2freq is not None):
            self.ns_weights = numpy.power(vocab.idx2freq, 0.75)
            self.ns_weights = torch.FloatTensor(self.ns_weights /
                                                vocab.idx2freq.sum())
        else:
            self.ns_weights = None
        log.info(
            f'SGNSModel(vocab_size: {NumUtil.comma_str(self.vocab_size)}, embedding_size: {embedding_size}, neg_sample: {self.neg_sample})'
        )
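The constructor above only sets up the two embedding tables and the negative-sampling weights; the forward pass is not part of this example. Below is a minimal sketch of how an SGNS forward pass is commonly written for such a model. It is not taken from the original code: the method name, tensor shapes, and the uniform fallback are assumptions, and it presumes neg_sample > 0.

    def forward(self, iword, owords):
        # iword: (batch,) center-word indices; owords: (batch, context) context-word indices.
        batch_size, context_size = owords.size()
        n_neg = context_size * self.neg_sample
        if self.ns_weights is not None:
            # draw negatives from the frequency^0.75 distribution prepared in __init__
            nwords = torch.multinomial(self.ns_weights,
                                       batch_size * n_neg,
                                       replacement=True).view(batch_size, n_neg)
        else:
            # hypothetical uniform fallback over non-padding indices
            nwords = torch.randint(1, self.vocab_size, (batch_size, n_neg))
        ivec = self.ivectors(iword).unsqueeze(2)  # (batch, dim, 1)
        ovec = self.ovectors(owords)              # (batch, context, dim)
        nvec = self.ovectors(nwords).neg()        # (batch, n_neg, dim)
        oloss = torch.bmm(ovec, ivec).squeeze(2).sigmoid().log().mean(1)
        nloss = torch.bmm(nvec, ivec).squeeze(2).sigmoid().log().view(
            batch_size, context_size, self.neg_sample).sum(2).mean(1)
        return -(oloss + nloss).mean()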
Example #2
    def build(cls,
              text_file: str,
              vocab_size=int(1e5),
              token=TOKEN,
              min_count=2,
              data_dir=WORD2VEC_DATA_DIR) -> 'Word2VecVocab':
        log.info(f"building vocab... {text_file}")
        if data_dir is None:
            data_dir = os.path.dirname(text_file)
        filepath = cls.get_filepath(data_dir, text_file, vocab_size)
        log.info(filepath)

        total_lines = FileUtil.count_lines(text_file)
        word2cnt = {}
        if text_file.endswith('.gz') or text_file.endswith('.zip'):
            f = gzip.open(text_file, 'rt', encoding='utf-8')  # text mode, so lines are str not bytes
        else:
            f = codecs.open(text_file, 'r', encoding='utf-8')
        with f:
            for no, line in enumerate(f):
                if no % 10000 == 0:
                    log.info(
                        f"{os.path.basename(text_file)} {no/total_lines*100:.1f}% readed."
                    )
                line = line.strip()
                if len(line) == 0:
                    continue
                sent = line.split()
                for word in sent:
                    word2cnt[word] = word2cnt.get(word, 0) + 1

        for word, cnt in word2cnt.copy().items():
            if cnt < min_count:
                del word2cnt[word]

        log.info(f'total unique words: {NumUtil.comma_str(len(word2cnt) + 1)}')
        idx2word = sorted(word2cnt, key=word2cnt.get, reverse=True)
        idx2word = [cls.UNK_CHAR] + idx2word[:vocab_size - 1]
        word2cnt[cls.UNK_CHAR] = 1
        idx2freq = numpy.array([word2cnt[word] for word in idx2word])
        idx2freq = idx2freq / idx2freq.sum()

        vocab = Word2VecVocab(token=token,
                              min_count=min_count,
                              idx2word=idx2word,
                              idx2freq=idx2freq)
        vocab.save(filepath=filepath)
        log.info(f"build vocab OK. {filepath}")
        return vocab
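The core of build() is a plain count / prune / truncate / normalize pipeline. The standalone sketch below reproduces just that pipeline on an in-memory list of sentences so it can be run without the surrounding class; the UNK token string and the tiny vocab cap are illustrative choices here, not the class constants.

import numpy

UNK_CHAR = '<UNK>'                                  # illustrative stand-in for cls.UNK_CHAR
sentences = ['the cat sat on the mat', 'the dog sat on the log']
min_count, vocab_size = 1, 10

# count words, then drop those below min_count
word2cnt = {}
for sent in sentences:
    for word in sent.split():
        word2cnt[word] = word2cnt.get(word, 0) + 1
word2cnt = {w: c for w, c in word2cnt.items() if c >= min_count}

# keep the most frequent vocab_size - 1 words, reserving index 0 for UNK
idx2word = sorted(word2cnt, key=word2cnt.get, reverse=True)
idx2word = [UNK_CHAR] + idx2word[:vocab_size - 1]
word2cnt[UNK_CHAR] = 1
idx2freq = numpy.array([word2cnt[w] for w in idx2word], dtype=float)
idx2freq = idx2freq / idx2freq.sum()                # relative frequencies, summing to 1

print(idx2word)
print(idx2freq)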
Example #3
    def build(cls,
              text_file: str,
              vocab: Word2VecVocab,
              window=5,
              side='both',
              data_dir=None) -> 'Word2VecCorpus':
        log.info(f"build corpus... {text_file}")
        if data_dir is None:
            data_dir = os.path.dirname(text_file)
        filepath = cls.get_filepath(data_dir=data_dir,
                                    vocab=vocab,
                                    window=window,
                                    side=side)

        if os.path.exists(filepath):
            log.info(f"corpus file exists. load {filepath}")
            return Word2VecCorpus.load(filepath)

        total_lines = FileUtil.count_lines(text_file)
        word2idx = {word: idx for idx, word in enumerate(vocab.idx2word)}
        data = []
        if text_file.endswith('.gz') or text_file.endswith('.zip'):
            f = gzip.open(text_file, 'rt', encoding='utf-8')  # text mode, so lines are str not bytes
        else:
            f = codecs.open(text_file, 'r', encoding='utf-8')
        with f:
            for no, line in enumerate(f):
                if no % 100000 == 0:
                    log.info(
                        f"{os.path.basename(text_file)} {no/total_lines*100:.1f}% readed."
                    )
                line = line.strip()
                if len(line) == 0:
                    continue
                sent = [word if word in word2idx else Word2VecVocab.UNK_CHAR
                        for word in line.split()]
                for i in range(len(sent)):
                    iword, owords = cls.skipgram(sent,
                                                 i,
                                                 window=window,
                                                 side=side)
                    data.append((word2idx[iword],
                                 [word2idx[oword] for oword in owords]))

        corpus = Word2VecCorpus(data=data,
                                vocab=vocab,
                                window=window,
                                side=side)
        corpus.save(filepath=filepath)
        log.info(f"build corpus OK. {filepath}")
        return corpus
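cls.skipgram is called in build() above but is not shown in this example. The sketch below is a plausible implementation, hedged as an assumption inferred from how the (iword, owords) pairs are consumed: it returns the center word plus a fixed-length context padded with the UNK token; which half 'front' and 'back' select is a guess.

    @staticmethod
    def skipgram(sent, i, window=5, side='both'):
        # Hypothetical helper: center word plus a fixed-size, UNK-padded context window.
        iword = sent[i]
        left = sent[max(0, i - window):i]
        right = sent[i + 1:i + 1 + window]
        left = [Word2VecVocab.UNK_CHAR] * (window - len(left)) + left
        right = right + [Word2VecVocab.UNK_CHAR] * (window - len(right))
        if side == 'front':
            owords = left
        elif side == 'back':
            owords = right
        else:  # 'both'
            owords = left + right
        return iword, owords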
Example #4
    def train(self, iterations: int, batch: int, embedding: Word2VecEmbedding,
              args: argparse.Namespace) -> str:
        batches_in_epoch = int(numpy.ceil(
            len(self.dataloader.dataset) / batch))
        total_batches = batches_in_epoch * iterations
        nth_total_batch = 0
        log.info(f'batches_in_epoch: {batches_in_epoch}')
        log.info(f'total_batches: {total_batches}')

        watch = WatchUtil(auto_stop=False)
        watch.start()
        best_loss = float("inf")
        first_epoch, last_epoch = self.epoch + 1, self.epoch + iterations + 1
        last_embedding_file = None

        log.info(Word2VecEmbedding.get_filenpath(args))
        for self.epoch in range(first_epoch, last_epoch):
            log.info(f"[e{self.epoch:2d}] {self}")
            loss_list = []
            for nth, (iword, owords) in enumerate(self.dataloader, 1):
                try:
                    loss = self.sgns(iword, owords)
                except RuntimeError:
                    loss_list = [float('-inf')]
                    break

                self.optim.zero_grad()
                loss.backward()
                self.optim.step()
                # if nth_batch == 1 and self.scheduler is not None and self.epoch >= self.decay_start_epoch:  # TODO: TEST
                #     self.scheduler.step()

                if self.learning_decay != 0:
                    PytorchUtil.set_learning_rate(self.optim,
                                                  self.epoch,
                                                  gamma=self.learning_decay,
                                                  base_lr=self.init_lr,
                                                  min_lr=1e-10,
                                                  decay_start=2,
                                                  decay_interval=3)

                lr = PytorchUtil.get_learning_rate(self.optim)

                _, negatives = owords.size()
                real_loss = loss.item() / float(negatives)

                loss_list.append(real_loss)

                nth_total_batch += 1
                progressed = nth_total_batch / total_batches
                seconds_per_batch = float(
                    watch.elapsed()) / float(nth_total_batch)
                remain_batches = total_batches - nth_total_batch
                remain_secs = int(seconds_per_batch * remain_batches)

                if nth == 1 or nth == batches_in_epoch or nth % 1000 == 0:
                    log.info(
                        f"[e{self.epoch:2d}][b{nth:5d}/{batches_in_epoch:5d}][{progressed*100:.1f}% remain: {DateUtil.secs_to_string(remain_secs)}][window: {self.window}][lr: {lr:.0e}] loss: {real_loss:.7f}"
                    )

            total_loss = numpy.mean(loss_list)
            log.info(
                f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss: {total_loss:.7f}, best_loss: {best_loss:.7f}"
            )
            if total_loss > best_loss or total_loss == float(
                    'inf') or total_loss == float(
                        '-inf'):  # worse loss than before, or diverged
                log.info('')
                log.info(
                    f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss > best_loss BREAK"
                )
                log.info('')
                break
            else:
                if total_loss < best_loss:
                    best_loss = total_loss
                log.info(
                    f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save()..."
                )
                args.epoch = self.epoch
                last_embedding_file = embedding.save(
                    idx2vec=trainer.embedding,
                    filepath=Word2VecEmbedding.get_filenpath(args))
                log.info(
                    f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save() OK. {os.path.basename(embedding.filepath)}"
                )
        return last_embedding_file
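PytorchUtil.set_learning_rate drives the exponential decay in the loop above but is not shown in this example. The sketch below is a hypothetical version of such a helper, assuming it applies base_lr * gamma^k, stepping k every decay_interval epochs once decay_start is reached and clamping at min_lr; the exact formula is an assumption, not the original implementation.

def set_learning_rate(optimizer, epoch, gamma=0.9, base_lr=1e-3,
                      min_lr=1e-10, decay_start=2, decay_interval=3):
    # Hypothetical exponential-decay schedule matching the call signature used in train().
    if epoch < decay_start:
        lr = base_lr
    else:
        steps = (epoch - decay_start) // decay_interval + 1
        lr = max(base_lr * (gamma ** steps), min_lr)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr

# usage sketch (optim is any torch.optim optimizer):
# for epoch in range(1, 11):
#     lr = set_learning_rate(optim, epoch, gamma=args.learning_decay, base_lr=args.learning_rate)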
Example #5
    parser.add_argument('--subsample',
                        default=Word2VecEmbedding.SUBSAMPLE,
                        type=float,
                        help="subsample threshold (default: 1e-5)")

    parser.add_argument('--learning_rate',
                        default=Word2VecEmbedding.LEARNING_RATE,
                        type=float,
                        help="learning rate for AdamOptimizer")
    parser.add_argument('--learning_decay',
                        default=Word2VecEmbedding.LEARNING_DECAY,
                        type=float,
                        help="exponential decay gamma (default: 0.0=no decay)")
    args = parser.parse_args()
    log.info(args)

    watch = WatchUtil(auto_stop=True)

    try:
        log.info(f'load {args.corpus_file} ...')
        watch.start()
        corpus = Word2VecCorpus.load(filepath=args.corpus_file)
        log.info(
            f'load {args.corpus_file} OK. (elapsed: {watch.elapsed_string()})')
        log.info(corpus.vocab)

        if len(corpus.vocab) > 1e5:  # out of memory (11GB GPU memory)
            args.device_no = None

        log.info('')
Example #6
    parser.add_argument('--vocab_size',
                        default=Word2VecVocab.MAX_VOCAB,
                        type=int,
                        help="maximum number of vocab (default:1e5)")
    parser.add_argument(
        '--token',
        default=Word2VecVocab.TOKEN,
        choices=['word', 'morph', 'character', 'jaso'],
        help="token is word or morph or character (default: 'word')")
    parser.add_argument('--min_count',
                        default=Word2VecVocab.MIN_COUNT,
                        type=int)
    args = parser.parse_args()

    try:
        if not os.path.exists(args.text_file):
            log.error(f'text file does not exist. {args.text_file}')
            exit(-1)

        vocab = Word2VecVocab.build(text_file=args.text_file,
                                    vocab_size=args.vocab_size,
                                    token=args.token,
                                    min_count=args.min_count,
                                    data_dir=args.data_dir)
        log.info(f'vocab: {vocab.filepath} {NumUtil.comma_str(len(vocab))}')
        log.info(f'vocab.idx2word: {vocab.idx2word[:10]}')
        log.info(f'vocab.idx2freq: {vocab.idx2freq[:10]}')
    except Exception:
        log.error(traceback.format_exc())
Example #7
    @property
    def data2text(self):
        for iword, owords in self.data:
            yield self.vocab.idx2word[iword], [self.vocab.idx2word[o] for o in owords]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--text_file', default=WIKIPEDIA_SENTENCE_FILE, type=str, help="corpus file path")
    parser.add_argument('--data_dir', default=WORD2VEC_DATA_DIR, type=str, help="data directory path (default:'./data')")

    parser.add_argument('--vocab_file', default=Word2VecVocab.DEFAULT_FILE, type=str)
    parser.add_argument('--window', default=Word2VecCorpus.WINDOW, type=int, help="window size")
    parser.add_argument('--side', default=Word2VecCorpus.SIDE, type=str, choices=['both', 'front', 'back'], help="target words in front or back or both (default: both)")
    args = parser.parse_args()
    try:
        log.info(f'vocab_file {args.vocab_file}')

        if not os.path.exists(args.vocab_file):
            log.error(f'vocab file does not exist. {args.vocab_file}')

        vocab = Word2VecVocab.load(args.vocab_file)
        log.info(vocab)
        for args.window in [args.window]:  # [1, 2, 3, 4, 5]:
            for args.side in [args.side]:  # ['both', 'front', 'back']:
                log.info(f'window: {args.window} side: {args.side}')
                corpus = Word2VecCorpus.build(text_file=args.text_file, vocab=vocab, window=args.window, side=args.side, data_dir=args.data_dir)
                log.info(f'corpus: {corpus.filepath} {NumUtil.comma_str(len(corpus))}')
    except Exception:
        log.error(traceback.format_exc())
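The data2text property in this example yields human-readable (center, context) pairs, which is handy for spot-checking a built corpus. A short hypothetical usage, assuming a corpus has already been built and saved as in Example #3 (the file path is illustrative):

import itertools

corpus = Word2VecCorpus.load('wikipedia.corpus.window_5.side_both.gz')  # illustrative path
for iword, owords in itertools.islice(corpus.data2text, 5):
    print(iword, '->', owords)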
Example #8
def word2vec_tensorboard(embedding_file_list,
                         top_n=int(1e5),
                         output_dir=TENSORBOARD_LOG_DIR):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    for filename in os.listdir(output_dir):
        # remove old tensorboard files
        os.remove(os.path.join(output_dir, filename))

    config = projector.ProjectorConfig()
    embedding_list = []
    for embedding_file in embedding_file_list:
        if not os.path.exists(embedding_file):
            log.info(f'{embedding_file} not exists. skipped.')
            continue

        embedding = Word2VecEmbedding.load(embedding_file)

        name = os.path.basename(embedding_file.replace('+', ''))
        while name.startswith('_'):
            name = name[1:]

        idx2vec = embedding.idx2vec
        idx2word, idx2freq = embedding.idx2word, embedding.idx2freq
        if top_n > 0:
            name += f'.top_n_{top_n}'
            idx2vec = idx2vec[:top_n]
            idx2word = embedding.idx2word[:top_n]
            idx2freq = embedding.idx2freq[:top_n]

        embedding_var = tf.Variable(idx2vec, name=name)
        embedding_list.append(embedding_var)
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        embedding.metadata_path = os.path.join(output_dir, f'{name}.tsv')

        log.info('')
        log.info(f'{embedding_file} loaded.')
        log.info(
            f'embedding_var.name: {embedding_var.name} shape: {embedding_var.shape}'
        )
        log.info(f'embedding.metadata_path: {embedding.metadata_path}')
        with open(embedding.metadata_path, 'wt') as out_f:
            out_f.write('spell\tfreq\n')
            for spell, freq in zip(idx2word, idx2freq):
                out_f.write(f'{spell}\t{freq:.7f}\n')

    summary_writer = tf.summary.FileWriter(output_dir)
    projector.visualize_embeddings(summary_writer, config)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(var_list=embedding_list)
        checkpoint_file = os.path.join(output_dir, f'{name}.ckpt')
        saver.save(sess, checkpoint_file, global_step=None)
        log.info(f'checkpoint_file: {checkpoint_file}')

    # change absolute path -> relative path
    for filename in ['checkpoint', 'projector_config.pbtxt']:
        filepath = os.path.join(output_dir, filename)

        lines = []
        with open(filepath, 'rt') as f:
            for line in f.readlines():
                lines.append(line.replace(output_dir, '.'))
        os.remove(filepath)
        with open(filepath, 'wt') as f:
            for line in lines:
                f.write(line)