    parser.add_argument('--vocab_size', default=Word2VecVocab.MAX_VOCAB, type=int, help="maximum vocabulary size (default: 1e5)")
    parser.add_argument('--token', default=Word2VecVocab.TOKEN, choices=['word', 'morph', 'character', 'jaso'],
                        help="token unit: word, morph, character, or jaso (default: 'word')")
    parser.add_argument('--min_count', default=Word2VecVocab.MIN_COUNT, type=int)
    args = parser.parse_args()

    try:
        if not os.path.exists(args.text_file):
            log.error(f'text file does not exist. {args.text_file}')
            exit(-1)

        vocab = Word2VecVocab.build(text_file=args.text_file, vocab_size=args.vocab_size, token=args.token,
                                    min_count=args.min_count, data_dir=args.data_dir)
        log.info(f'vocab: {vocab.filepath} {NumUtil.comma_str(len(vocab))}')
        log.info(f'vocab.idx2word: {vocab.idx2word[:10]}')
        log.info(f'vocab.idx2freq: {vocab.idx2freq[:10]}')
    except:
        log.error(traceback.format_exc())
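
# The block above only drives Word2VecVocab.build(); the build step itself is not shown in this
# excerpt. Below is a minimal, hypothetical sketch of frequency-based vocabulary construction for
# the 'word' token case (function and variable names are illustrative, not the Word2VecVocab API):
from collections import Counter


def build_vocab_sketch(text_file, vocab_size=int(1e5), min_count=2):
    counter = Counter()
    with open(text_file, encoding='utf-8') as f:
        for line in f:
            counter.update(line.split())

    # keep the most frequent words, drop those below min_count, reserve index 0 for unknown words
    idx2word = ['<UNK>']
    idx2freq = [0]
    for word, freq in counter.most_common(vocab_size - 1):
        if freq < min_count:
            break
        idx2word.append(word)
        idx2freq.append(freq)
    return idx2word, idx2freq
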
                                     neg_weight=args.neg_weight, subsample=args.subsample,
                                     learning_rate=args.learning_rate, learning_decay=args.learning_decay)
        log.info(f'Word2VecTrainer() OK. (elapsed: {watch.elapsed_string()})')
        log.info(trainer)

        log.info(f'trainer.train(epoch={args.epoch}, batch={args.batch}) ...')
        watch.start()
        embedding = Word2VecEmbedding(filepath=embedding_file, vocab=corpus.vocab)
        embedding_file = trainer.train(iterations=args.epoch, batch=args.batch, embedding=embedding, args=args)
        log.info(f'embedding_file: {embedding_file} train OK. (elapsed: {watch.elapsed_string()})')
        if is_server():
            SlackUtil.send_message(f'embedding_file: {embedding_file} train OK. (elapsed: {watch.elapsed_string()})')
            SlackUtil.send_message(f'[{hostname()}][{args.device_no}] {sys.argv} OK.')
    except:
        log.error(traceback.format_exc())
        if is_server():
            SlackUtil.send_message(f'[{hostname()}][{args.device_no}] {sys.argv} ERROR.')
            SlackUtil.send_message(traceback.format_exc())
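
# For orientation only: 'subsample' and 'neg_weight' are typically used as in the standard
# word2vec formulation (subsampling of frequent words, unigram^0.75 negative-sampling
# distribution). Whether Word2VecTrainer follows these exact formulas is an assumption; this
# sketch just shows the common variant computed from a vocabulary's idx2freq counts:
import numpy as np


def word2vec_sampling_sketch(idx2freq, subsample=1e-5, neg_weight=0.75):
    freq = np.asarray(idx2freq, dtype=np.float64)
    rel_freq = freq / freq.sum()

    # probability of *keeping* each word occurrence during training (frequent words get dropped)
    keep_prob = np.minimum(1.0, np.sqrt(subsample / np.maximum(rel_freq, 1e-12)))

    # unigram distribution raised to neg_weight for drawing negative samples
    neg_dist = freq ** neg_weight
    neg_dist /= neg_dist.sum()
    return keep_prob, neg_dist
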
    @property
    def data2text(self):
        for iword, owords in self.data:
            yield self.vocab.idx2word[iword], [self.vocab.idx2word[o] for o in owords]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--text_file', default=WIKIPEDIA_SENTENCE_FILE, type=str, help="corpus file path")
    parser.add_argument('--data_dir', default=WORD2VEC_DATA_DIR, type=str, help="data directory path (default: './data')")
    parser.add_argument('--vocab_file', default=Word2VecVocab.DEFAULT_FILE, type=str)
    parser.add_argument('--window', default=Word2VecCorpus.WINDOW, type=int, help="window size")
    parser.add_argument('--side', default=Word2VecCorpus.SIDE, type=str, choices=['both', 'front', 'back'],
                        help="target words in front, in back, or both (default: 'both')")
    args = parser.parse_args()

    try:
        log.info(f'vocab_file {args.vocab_file}')
        if not os.path.exists(args.vocab_file):
            log.error(f'vocab file does not exist. {args.vocab_file}')
            exit(-1)

        vocab = Word2VecVocab.load(args.vocab_file)
        log.info(vocab)
        for args.window in [args.window]:  # [1, 2, 3, 4, 5]:
            for args.side in [args.side]:  # ['both', 'front', 'back']:
                log.info(f'window: {args.window} side: {args.side}')
                corpus = Word2VecCorpus.build(text_file=args.text_file, vocab=vocab, window=args.window, side=args.side,
                                              data_dir=args.data_dir)
                log.info(f'corpus: {corpus.filepath} {NumUtil.comma_str(len(corpus))}')
    except:
        log.error(traceback.format_exc())
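
# What the (iword, owords) pairs consumed by data2text look like: for each center word, the
# context words within `window` positions on the chosen side. A hypothetical sketch working on
# plain word lists, not the actual Word2VecCorpus.build logic (which stores vocabulary indices):
def skipgram_pairs_sketch(words, window=2, side='both'):
    pairs = []
    for i, center in enumerate(words):
        front = words[max(0, i - window):i]  # words before the center
        back = words[i + 1:i + 1 + window]   # words after the center
        if side == 'front':
            context = front
        elif side == 'back':
            context = back
        else:  # 'both'
            context = front + back
        pairs.append((center, context))
    return pairs


# e.g. skipgram_pairs_sketch(['a', 'b', 'c', 'd'], window=1, side='both')
# -> [('a', ['b']), ('b', ['a', 'c']), ('c', ['b', 'd']), ('d', ['c'])]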