def main(train_prefix, vocab_file, save_dir):
    # load the vocab
    vocab = load_vocab(vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens = 768_648_884
    n_train_tokens = 1_246_091

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [
                [1, 32],
                [2, 32],
                [3, 64],
                # [4, 128],
                # [5, 256],
                # [6, 512],
                # [7, 1024],
            ],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            # 'projection_dim': 512,
            'projection_dim': 64,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 2,  # 10
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = save_dir
    tf_log_dir = save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
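# Hypothetical command-line entry point for the positional-argument variant
# above (a minimal sketch; the flag names --train_prefix, --vocab_file and
# --save_dir are assumptions and not part of the original snippet).
def cli():
    import argparse

    parser = argparse.ArgumentParser(description='Train an ELMo biLM')
    parser.add_argument('--train_prefix', help='prefix/glob for the training shards')
    parser.add_argument('--vocab_file', help='path to the vocabulary file')
    parser.add_argument('--save_dir', help='directory for checkpoints and logs')
    cli_args = parser.parse_args()

    main(cli_args.train_prefix, cli_args.vocab_file, cli_args.save_dir)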
def main(args):
    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        if options.get('polyglot'):
            data = BidirectionalPolyglotLMDataset(test_prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    # ipy.embed()
    test(options, ckpt_file, data, batch_size=args.batch_size)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    permute_number = options.get('permute_number', 4)

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    elif options.get('multidirectional'):
        data = MultidirectionalLMDataset(test_prefix, vocab, permute_number, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    # fixed typo: the original passed args.batch_siz
    test(options, ckpt_file, data, batch_size=args.batch_size,
         permute_number=permute_number)
def main(args):
    print(args)
    print('-' * 100)

    print('Loading models and options...')
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    print('Loading vocabulary...')
    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    shards = glob(args.test_prefix)
    shards.sort()
    # print(shards)

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    print('Building dataset...')
    datasets = []
    for shard in shards:
        if options.get('bidirectional'):
            datasets.append(BidirectionalLMDataset(shard, vocab, **kwargs))
        else:
            datasets.append(LMDataset(shard, vocab, **kwargs))

    print('Predicting...')
    tag(options, ckpt_file, shards, datasets, batch_size=args.batch_size)

    print('-' * 100)
    print('done.')
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = args.batch_size  # batch size for each GPU
    n_gpus = args.n_gpus

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = args.ntokens

    n_negative_samples_batch = 8192
    if n_negative_samples_batch > vocab.size:
        n_negative_samples_batch = int(vocab.size / 2)

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': args.n_epochs,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': n_negative_samples_batch,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    print("NGPUS in train_elmo: %i" % (n_gpus,))
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    # vocab = load_vocab(args.vocab_file, max_word_length)
    vocab = load_vocab(args.vocab_file, args.stroke_vocab_file, 50)  # Winfred stroke_vocab

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 2

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768

    options = {
        'bidirectional': True,
        # 'char_cnn': {'activation': 'relu',
        #              'embedding': {'dim': 16},  # embedding dimension per character
        #              'filters': [
        #                  [1, 32],
        #                  [2, 32],
        #                  [3, 64],
        #                  [4, 128],
        #                  [5, 256],
        #                  # [6, 512],
        #                  # [7, 1024]
        #              ],
        #              'max_characters_per_token': 50,  # maximum number of characters per token
        #              'n_characters': 300000,  # total number of characters in the character vocabulary (only ~60?)
        #              'n_highway': 2},  # number of highway layers
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,  # if provided, the cell state is clipped by this value prior to the cell output activation
            'dim': 4096,  # number of hidden units
            'n_layers': 2,
            'proj_clip': 3,  # if num_proj > 0 and proj_clip is provided, projected values are clipped elementwise to [-proj_clip, proj_clip]
            'projection_dim': 512,  # num_proj: output dimension of the projection matrix; if None, no projection is performed (final / projection-layer dimension)
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 1,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,  # length of the input sequence (maximum number of time steps)
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    # print("",)
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    # vocab = load_vocab(args.vocab_file, 50)
    vocab = load_vocab(args.vocab_file, None)  # the second argument of load_vocab should be None here

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 2
    os.environ['CUDA_VISIBLE_DEVICES'] = '4,5'  # adjust n_gpus / CUDA_VISIBLE_DEVICES to your setup

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens is optional to change; it only affects the logged progress.
    # To check the number of lines in your corpus, run `wc -l corpus.txt`.
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        # option change: the whole char_cnn section is commented out; adjust the rest as needed
        # 'char_cnn': {'activation': 'relu',
        #              'embedding': {'dim': 16},
        #              'filters': [[1, 32],
        #                          [2, 32],
        #                          [3, 64],
        #                          [4, 128],
        #                          [5, 256],
        #                          [6, 512],
        #                          [7, 1024]],
        #              'max_characters_per_token': 50,
        #              'n_characters': 261,
        #              'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 300,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 300,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 3,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 256,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    vocab = load_vocab(args.vocab_file, args.vocab_min_occur)

    train_tokens = 768648884  # (this for 1B Word Benchmark)
    if args.train_tokens == 'wikitext2':
        train_tokens = 2051910  # Enwiki2
    elif args.train_tokens == 'wikitext103':
        train_tokens = 101425658  # wikitext-103
    if args.is_line:
        train_tokens *= 3

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': train_tokens,
        'batch_size': args.train_batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    train_data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, train_data, args.n_gpus, tf_save_dir, tf_log_dir,
          converge=args.converge)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, None)

    # define the options
    batch_size = 512  # batch size for each GPU
    n_gpus = 3
    os.environ['CUDA_VISIBLE_DEVICES'] = '1, 2, 6'

    # number of tokens in training data (this for 1B Word Benchmark)
    # word 8799
    # char 2355
    n_train_tokens = 768648884
    # n_train_tokens = 8799

    options = {
        'bidirectional': True,
        # 'char_cnn': {'activation': 'relu',
        #              'embedding': {'dim': 16},
        #              'filters': [[1, 32],
        #                          [2, 32],
        #                          [3, 64],
        #                          [4, 128],
        #                          [5, 256],
        #                          [6, 512],
        #                          [7, 1024]],
        #              'max_characters_per_token': 50,
        #              'n_characters': 261,
        #              'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 1024,
    }
    print('vocab_size:', vocab.size)

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    ckpt_file = None
    # use os.path.join so the check does not depend on a trailing slash in save_dir
    if os.path.exists(os.path.join(args.save_dir, 'options.json')):
        options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = args.n_gpus

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir, restart_ckpt_file=ckpt_file)
def main(args):
    # load the vocab
    max_token_length = 7  # maximum length (in characters) of a word
    vocab = load_vocab(args.vocab_file, max_token_length)

    # define the options
    batch_size = 16  # batch size for each GPU
    n_gpus = 6

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens = 768648884
    n_train_tokens = 94268535  # total number of tokens in the training set

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 7,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 20,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab_file = os.path.join(args.folder, 'vocabulary.txt')
    vocab = load_vocab(vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = args.gpu

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens = 768648884
    n_train_tokens = args.tokens

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': args.size},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': args.epoch,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = os.path.join(args.folder, 'corpus', '*')
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = os.path.join(args.folder, args.checkpoint)
    tf_log_dir = tf_save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU

    # number of GPUs to use
    n_gpus = 2
    # which GPUs to run on; training is parallelized across them
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    # number of tokens in the training corpus (not deduplicated); this affects
    # training time and should be adjusted for your own corpus
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        # for Chinese, remove the char_cnn section
        # 'char_cnn': {'activation': 'relu',
        #              'embedding': {'dim': 16},
        #              'filters': [[1, 32],
        #                          [2, 32],
        #                          [3, 64],
        #                          [4, 128],
        #                          [5, 256],
        #                          [6, 512],
        #                          [7, 1024]],
        #              'max_characters_per_token': 50,
        #              'n_characters': 261,
        #              'n_highway': 2},
        'dropout': 0.1,
        # LSTM hyperparameters; adjust as needed
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        # number of negative samples per batch; reduce this for very small corpora
        'n_negative_samples_batch': 20,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for indonesia wikidump)
    n_train_tokens = 25766422

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 1
        },
        'dropout': 0.1,
        'lstm': {
            'use_skip_connections': True,
            'projection_dim': 128,
            'cell_clip': 3,
            'proj_clip': 3,
            'dim': 1024,
            'n_layers': 2
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 4,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192
        # 'n_negative_samples_batch': 1024,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)  # 50 is max word length

    # define the options
    batch_size = 200  # TODO: batch size for each GPU.
    n_gpus = 1  # TODO: how many gpus do you have?

    # number of tokens in training data
    n_train_tokens = 198782  # TODO: update this number to be the total number of tokens in your training data

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,  # TODO: update this to how many epochs you want to run
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
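# Sketch of one way to fill in the n_train_tokens TODO above: count the
# whitespace-separated tokens across all training shards. This helper is an
# illustration, not part of the original script; it assumes the shards are
# plain text files matched by the same glob pattern you pass as train_prefix.
def count_train_tokens(train_prefix):
    """Return the total number of whitespace-separated tokens in all shards."""
    from glob import glob  # local import so it cannot shadow an existing `glob` name

    total = 0
    for path in glob(train_prefix):
        with open(path, encoding='utf-8') as f:
            for line in f:
                total += len(line.split())
    return total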
def main(args):
    if args.gpu is not None:
        if ',' in args.gpu:
            args.gpu = args.gpu.split(',')
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None

    # guard against an unbound name when neither the saved options nor the CLI
    # flag request polyglot training (the original only set polyglot = True)
    polyglot = 'polyglot' in options or args.polyglot
    vocab = load_vocab(args.vocab_files, max_word_length=max_word_length,
                       polyglot=polyglot)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        if polyglot:
            data = BidirectionalPolyglotLMDataset(prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, None, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
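# The set_gpu helper called above is not defined in these snippets. A minimal
# sketch of what such a helper commonly does, assuming it only restricts the
# visible CUDA devices via CUDA_VISIBLE_DEVICES (an assumption, not the
# original implementation):
import os


def set_gpu(gpu):
    """Restrict visible CUDA devices to `gpu` (an int, string, or list of ids)."""
    if isinstance(gpu, (list, tuple)):
        gpu = ','.join(str(g) for g in gpu)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)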
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 64  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = args.n_train_tokens

    options = {
        'bidirectional': True,
        # 'char_cnn': {'activation': 'tanh',
        #              'embedding': {'dim': 4},
        #              'filters': [
        #                  [1, 8],
        #                  [2, 8],
        #                  [3, 16],
        #                  [4, 32],
        #                  [5, 64],
        #              ],
        #              'max_characters_per_token': 50,
        #              'n_characters': 261,
        #              'n_highway': 1},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 256,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 64,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 2048,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def _load_data(self, reverse, chars, bidirectional=False):
    if chars:
        vocab = UnicodeCharsVocabulary(self._tmp_vocab, 5)
    else:
        vocab = Vocabulary(self._tmp_vocab)

    if not bidirectional:
        data = LMDataset(self._tmp_train, vocab, reverse=reverse)
    else:
        data = BidirectionalLMDataset(self._tmp_train, vocab)

    return data
def _get_data(self, bidirectional, use_chars, test=False):
    vocab_file = os.path.join(FIXTURES, 'vocab.txt')
    if use_chars:
        vocab = load_vocab(vocab_file, 10)
    else:
        vocab = load_vocab(vocab_file, None)

    prefix = os.path.join(FIXTURES, 'data.txt')

    if bidirectional:
        data = BidirectionalLMDataset(prefix, vocab, test=test)
    else:
        data = LMDataset(prefix, vocab, test=test, reverse=False)

    return data, vocab
def main(args):
    print('h0')
    vocab = load_vocab(args.vocab_file, 10)
    print('h1')

    batch_size = 64
    n_gpus = 3
    n_train_tokens = 4775300

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 10,
            'n_characters': 105047,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    inpattern = args.train_prefix
    data = BidirectionalLMDataset(inpattern, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train_with_single_core(options, data, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': args.n_epochs,
        'n_train_tokens': args.n_train_tokens,
        'batch_size': args.batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    train(options, data, args.n_gpus, args.save_dir, args.log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # load the options from a JSON file (the original passed the path straight
    # to json.load; the file needs to be opened first)
    with open(args.options_file) as f:
        options = json.load(f)

    # number of tokens in training data (this for 1B Word Benchmark)
    options['n_tokens_vocab'] = vocab.size

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # n_gpus was never defined in the original; assuming it is supplied on the
    # command line as args.n_gpus, as in the other training scripts
    n_gpus = args.n_gpus
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def resume(options, prefix, vocab, n_gpus, tf_save_dir, tf_log_dir, ckpt_file):
    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    tf.reset_default_graph()

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    train(options, data, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
    clean_checkpoint(tf_save_dir)
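# Possible usage of resume(): pick training back up from the latest checkpoint
# in a save directory. This is only an illustration; it assumes the
# load_options_latest_checkpoint and load_vocab helpers used elsewhere in these
# snippets are available in the same scope.
def resume_from_save_dir(save_dir, train_prefix, vocab_file, n_gpus=1):
    options, ckpt_file = load_options_latest_checkpoint(save_dir)
    max_word_length = options.get('char_cnn', {}).get('max_characters_per_token')
    vocab = load_vocab(vocab_file, max_word_length)
    resume(options, train_prefix, vocab, n_gpus, save_dir, save_dir, ckpt_file)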
def main(args):
    vocab = load_vocab(args.vocab_file, args.vocab_min_occur)

    train_tokens = 768648884  # (this for 1B Word Benchmark)
    if args.train_tokens == 'wikitext2':
        # Enwiki2 is 3x longer if split into sentences, and a further 1.5x
        # when using a sentence split size of 20
        train_tokens = 2051910 * 3 * 1.5
    elif args.train_tokens == 'wikitext103':
        train_tokens = 101425658 * 3 * 1.5  # wikitext-103

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': train_tokens,
        'batch_size': args.train_batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    # we don't shuffle here since our curriculum generator shuffles
    train_data = BidirectionalLMDataset(prefix, vocab, test=False,
                                        shuffle_on_load=False,
                                        curriculum=True, num_steps=20)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train_curriculum(options, train_data, args.n_gpus, tf_save_dir, tf_log_dir,
                     args.initial_competence, args.competence_increment,
                     args.target_batches, args.test_prefix, args.test_interval,
                     vocab)
def top_level(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
    vocab_file = os.path.join(args.save_dir, 'vocabs.txt')

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
def main(args):
    max_token_length = args.max_token_length and int(args.max_token_length)

    print("args.vocab_file: ", args.vocab_file)
    print("max_token_length: ", max_token_length)
    print("args.stroke_vocab_file: ", args.stroke_vocab_file)

    # load the vocab
    # vocab = load_vocab(args.vocab_file, 50)
    vocab = load_vocab(args.vocab_file,
                       args.stroke_vocab_file,  # Winfred stroke_vocab
                       max_token_length)        # Winfred stroke_vocab

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 10731134  # 768648884

    # options = {
    #     'bidirectional': True,
    #     'char_cnn': {'activation': 'relu',
    #                  'embedding': {'dim': 16},
    #                  'filters': [[1, 32],
    #                              [2, 32],
    #                              [3, 64],
    #                              [4, 128],
    #                              [5, 256],
    #                              [6, 512],
    #                              [7, 1024]],
    #                  'max_characters_per_token': max_token_length,
    #                  'n_characters': 266,  # original 261 + 5 stroke characters
    #                  'n_highway': 2},  # 2
    #     'dropout': 0.1,
    #     'lstm': {
    #         'cell_clip': 3,
    #         'dim': 4096,
    #         'n_layers': 2,
    #         'proj_clip': 3,
    #         'projection_dim': 512,
    #         'use_skip_connections': True},
    #     'all_clip_norm_val': 10.0,
    #     'n_epochs': 1,
    #     'n_train_tokens': n_train_tokens,
    #     'batch_size': batch_size,
    #     'n_tokens_vocab': vocab.size,
    #     'unroll_steps': 20,
    #     'n_negative_samples_batch': 8192,
    # }

    # Add by Winfred
    option_file = os.path.join(args.save_dir, "options.json")
    with open(option_file, "r") as f:
        options = json.load(f)

    if max_token_length:
        options["char_cnn"]["max_characters_per_token"] = max_token_length
        print("Wrong max_token_length, already corrected")
    if "char_cnn" in options:
        options["char_cnn"]["n_characters"] = 266
        print("Wrong n_characters, already corrected")
    # End

    prefix = args.train_prefix
    data = BidirectionalLMDataset(
        prefix, vocab, test=False,
        shuffle_on_load=False,           # True
        do_record=args.do_record,        # Add by Winfred
        records_path=args.records_path,  # Add by Winfred
        vocab_file=args.vocab_file)      # Add by Winfred

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=args.restart_ckpt_file)
def main(args):
    # load the vocab
    # the longest token in the vocab is 10 syllables --> 30 characters after jamo decomposition
    # bos char + 30 + eos char = 32
    vocab = load_vocab(args.vocab_file, 32)

    # define the options
    # batch size for each GPU
    batch_size = 64 * 2
    n_gpus = 1

    # number of unique tokens in the tokenized "Science of Love" KakaoTalk data
    # (identified_corpus_20180105)
    # (-> this should actually be the total token count, not the unique count)
    # n_train_tokens = 609518
    # n_train_tokens = 626932956  # number of tokens in 8000pair_tokenized_corpus.txt (~620M)
    # token count used temporarily
    n_train_tokens = 200000000

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'tanh',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 32,
            'n_characters': 62,
            'n_highway': 2,
        },
        'dropout': 0.2,
        'lstm': {
            'cell_clip': 3,
            'dim': 256,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 256,
            'use_skip_connections': True,
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 10,
        'n_negative_samples_batch': 4096,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(filepattern=prefix, vocab=vocab, test=False,
                                  shuffle_on_load=True, with_tab=False)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(
        options, data, n_gpus, tf_save_dir, tf_log_dir,
        restart_ckpt_file='/media/scatter/scatterdisk/elmo_ckpt/elmo_ckpt_0919_2142/model.ckpt_batch-625000'
    )
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50, variable=args.variable)
    if args.variable:
        vocab.save_vocab(args.save_dir)

    # define the options
    if args.batch_size > 0:
        batch_size = args.batch_size
    else:
        batch_size = 128  # batch size for each GPU

    if args.n_epochs > 0:
        n_epochs = args.n_epochs
    else:
        n_epochs = 10

    n_gpus = 1

    if args.lang == 'ga':
        n_train_tokens = 3573002
    elif args.lang == 'mt':
        n_train_tokens = 1045392
    elif args.lang == 'sg':
        n_train_tokens = 1196930
    elif args.lang == 'vi':
        n_train_tokens = 5552361
    else:
        # raising a bare string is invalid in Python 3; wrap it in an exception
        raise ValueError(f'Unrecognized language: {args.lang}')

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': vocab.n_chars if args.variable else 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': n_epochs,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)