def main(train_prefix, vocab_file, save_dir):
    # load the vocab
    vocab = load_vocab(vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens = 768_648_884
    n_train_tokens = 1_246_091

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [
                [1, 32],
                [2, 32],
                [3, 64],
                # [4, 128],
                # [5, 256],
                # [6, 512],
                # [7, 1024],
            ],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            # 'projection_dim': 512,
            'projection_dim': 64,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 2,  # 10
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = save_dir
    tf_log_dir = save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
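# Hypothetical command-line entry point for the positional-argument variant
# above (a minimal sketch; the flag names --train_prefix, --vocab_file and
# --save_dir are assumptions and not part of the original snippet).
def cli():
    import argparse

    parser = argparse.ArgumentParser(description='Train an ELMo biLM')
    parser.add_argument('--train_prefix', help='prefix/glob for the training shards')
    parser.add_argument('--vocab_file', help='path to the vocabulary file')
    parser.add_argument('--save_dir', help='directory for checkpoints and logs')
    cli_args = parser.parse_args()

    main(cli_args.train_prefix, cli_args.vocab_file, cli_args.save_dir)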
def main(args):
    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        if options.get('polyglot'):
            data = BidirectionalPolyglotLMDataset(test_prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    # ipy.embed()
    test(options, ckpt_file, data, batch_size=args.batch_size)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    permute_number = options.get('permute_number', 4)

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    elif options.get('multidirectional'):
        data = MultidirectionalLMDataset(test_prefix, vocab, permute_number, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    # fixed typo: the original passed args.batch_siz
    test(options, ckpt_file, data, batch_size=args.batch_size,
         permute_number=permute_number)
def main(args):
    print(args)
    print('-' * 100)

    print('Loading models and options...')
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    print('Loading vocabulary...')
    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    shards = glob(args.test_prefix)
    shards.sort()
    # print(shards)

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    print('Building dataset...')
    datasets = []
    for shard in shards:
        if options.get('bidirectional'):
            datasets.append(BidirectionalLMDataset(shard, vocab, **kwargs))
        else:
            datasets.append(LMDataset(shard, vocab, **kwargs))

    print('Predicting...')
    tag(options, ckpt_file, shards, datasets, batch_size=args.batch_size)

    print('-' * 100)
    print('done.')
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = args.batch_size  # batch size for each GPU
    n_gpus = args.n_gpus

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = args.ntokens

    n_negative_samples_batch = 8192
    if n_negative_samples_batch > vocab.size:
        n_negative_samples_batch = int(vocab.size / 2)

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': args.n_epochs,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': n_negative_samples_batch,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    print("NGPUS in train_elmo: %i" % (n_gpus,))
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    # vocab = load_vocab(args.vocab_file, max_word_length)
    vocab = load_vocab(args.vocab_file, args.stroke_vocab_file, 50)  # Winfred stroke_vocab

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 2

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768

    options = {
        'bidirectional': True,
        # 'char_cnn': {'activation': 'relu',
        #              'embedding': {'dim': 16},  # embedding dimension per character
        #              'filters': [
        #                  [1, 32],
        #                  [2, 32],
        #                  [3, 64],
        #                  [4, 128],
        #                  [5, 256],
        #                  # [6, 512],
        #                  # [7, 1024]
        #              ],
        #              'max_characters_per_token': 50,  # maximum number of characters per token
        #              'n_characters': 300000,  # total number of characters in the character vocabulary (only ~60?)
        #              'n_highway': 2},  # number of highway layers
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,  # if provided, the cell state is clipped by this value prior to the cell output activation
            'dim': 4096,  # number of hidden units
            'n_layers': 2,
            'proj_clip': 3,  # if num_proj > 0 and proj_clip is provided, projected values are clipped elementwise to [-proj_clip, proj_clip]
            'projection_dim': 512,  # num_proj: output dimension of the projection matrix; if None, no projection is performed (final / projection-layer dimension)
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 1,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,  # length of the input sequence (maximum number of time steps)
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    # print("",)
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    # vocab = load_vocab(args.vocab_file, 50)
    vocab = load_vocab(args.vocab_file, None)  # the second argument of load_vocab should be None here

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 2
    os.environ['CUDA_VISIBLE_DEVICES'] = '4,5'  # adjust n_gpus / CUDA_VISIBLE_DEVICES to your setup

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens is optional to change; it only affects the logged progress.
    # To check the number of lines in your corpus, run `wc -l corpus.txt`.
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        # option change: the whole char_cnn section is commented out; adjust the rest as needed
        # 'char_cnn': {'activation': 'relu',
        #              'embedding': {'dim': 16},
        #              'filters': [[1, 32],
        #                          [2, 32],
        #                          [3, 64],
        #                          [4, 128],
        #                          [5, 256],
        #                          [6, 512],
        #                          [7, 1024]],
        #              'max_characters_per_token': 50,
        #              'n_characters': 261,
        #              'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 300,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 300,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 3,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 256,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    vocab = load_vocab(args.vocab_file, args.vocab_min_occur)

    train_tokens = 768648884  # (this for 1B Word Benchmark)
    if args.train_tokens == 'wikitext2':
        train_tokens = 2051910  # Enwiki2
    elif args.train_tokens == 'wikitext103':
        train_tokens = 101425658  # wikitext-103
    if args.is_line:
        train_tokens *= 3

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': train_tokens,
        'batch_size': args.train_batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    train_data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, train_data, args.n_gpus, tf_save_dir, tf_log_dir,
          converge=args.converge)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, None)

    # define the options
    batch_size = 512  # batch size for each GPU
    n_gpus = 3
    os.environ['CUDA_VISIBLE_DEVICES'] = '1, 2, 6'

    # number of tokens in training data (this for 1B Word Benchmark)
    # word 8799
    # char 2355
    n_train_tokens = 768648884
    # n_train_tokens = 8799

    options = {
        'bidirectional': True,
        # 'char_cnn': {'activation': 'relu',
        #              'embedding': {'dim': 16},
        #              'filters': [[1, 32],
        #                          [2, 32],
        #                          [3, 64],
        #                          [4, 128],
        #                          [5, 256],
        #                          [6, 512],
        #                          [7, 1024]],
        #              'max_characters_per_token': 50,
        #              'n_characters': 261,
        #              'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 1024,
    }
    print('vocab_size:', vocab.size)

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    ckpt_file = None
    # use os.path.join so the check does not depend on a trailing slash in save_dir
    if os.path.exists(os.path.join(args.save_dir, 'options.json')):
        options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = args.n_gpus

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir, restart_ckpt_file=ckpt_file)
def main(args):
    # load the vocab
    max_token_length = 7  # maximum length (in characters) of a word
    vocab = load_vocab(args.vocab_file, max_token_length)

    # define the options
    batch_size = 16  # batch size for each GPU
    n_gpus = 6

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens = 768648884
    n_train_tokens = 94268535  # total number of tokens in the training set

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 7,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 20,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab_file = os.path.join(args.folder, 'vocabulary.txt')
    vocab = load_vocab(vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = args.gpu

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens = 768648884
    n_train_tokens = args.tokens

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': args.size},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': args.epoch,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = os.path.join(args.folder, 'corpus', '*')
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = os.path.join(args.folder, args.checkpoint)
    tf_log_dir = tf_save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU

    # number of GPUs to use
    n_gpus = 2
    # which GPUs to run on; training is parallelized across them
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    # number of tokens in the training corpus (not deduplicated); this affects
    # training time and should be adjusted for your own corpus
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        # for Chinese, remove the char_cnn section
        # 'char_cnn': {'activation': 'relu',
        #              'embedding': {'dim': 16},
        #              'filters': [[1, 32],
        #                          [2, 32],
        #                          [3, 64],
        #                          [4, 128],
        #                          [5, 256],
        #                          [6, 512],
        #                          [7, 1024]],
        #              'max_characters_per_token': 50,
        #              'n_characters': 261,
        #              'n_highway': 2},
        'dropout': 0.1,
        # LSTM hyperparameters; adjust as needed
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        # number of negative samples per batch; reduce this for very small corpora
        'n_negative_samples_batch': 20,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for indonesia wikidump)
    n_train_tokens = 25766422

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 1
        },
        'dropout': 0.1,
        'lstm': {
            'use_skip_connections': True,
            'projection_dim': 128,
            'cell_clip': 3,
            'proj_clip': 3,
            'dim': 1024,
            'n_layers': 2
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 4,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192
        # 'n_negative_samples_batch': 1024,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)  # 50 is max word length

    # define the options
    batch_size = 200  # TODO: batch size for each GPU.
    n_gpus = 1  # TODO: how many gpus do you have?

    # number of tokens in training data
    n_train_tokens = 198782  # TODO: update this number to be the total number of tokens in your training data

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,  # TODO: update this to how many epochs you want to run
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
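# Sketch of one way to fill in the n_train_tokens TODO above: count the
# whitespace-separated tokens across all training shards. This helper is an
# illustration, not part of the original script; it assumes the shards are
# plain text files matched by the same glob pattern you pass as train_prefix.
def count_train_tokens(train_prefix):
    """Return the total number of whitespace-separated tokens in all shards."""
    from glob import glob  # local import so it cannot shadow an existing `glob` name

    total = 0
    for path in glob(train_prefix):
        with open(path, encoding='utf-8') as f:
            for line in f:
                total += len(line.split())
    return total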
def main(args):
    if args.gpu is not None:
        if ',' in args.gpu:
            args.gpu = args.gpu.split(',')
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None

    # guard against an unbound name when neither the saved options nor the CLI
    # flag request polyglot training (the original only set polyglot = True)
    polyglot = 'polyglot' in options or args.polyglot
    vocab = load_vocab(args.vocab_files, max_word_length=max_word_length,
                       polyglot=polyglot)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        if polyglot:
            data = BidirectionalPolyglotLMDataset(prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, None, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
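# The set_gpu helper called above is not defined in these snippets. A minimal
# sketch of what such a helper commonly does, assuming it only restricts the
# visible CUDA devices via CUDA_VISIBLE_DEVICES (an assumption, not the
# original implementation):
import os


def set_gpu(gpu):
    """Restrict visible CUDA devices to `gpu` (an int, string, or list of ids)."""
    if isinstance(gpu, (list, tuple)):
        gpu = ','.join(str(g) for g in gpu)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)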
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 64  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = args.n_train_tokens

    options = {
        'bidirectional': True,
        # 'char_cnn': {'activation': 'tanh',
        #              'embedding': {'dim': 4},
        #              'filters': [
        #                  [1, 8],
        #                  [2, 8],
        #                  [3, 16],
        #                  [4, 32],
        #                  [5, 64],
        #              ],
        #              'max_characters_per_token': 50,
        #              'n_characters': 261,
        #              'n_highway': 1},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 256,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 64,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 2048,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def _load_data(self, reverse, chars, bidirectional=False):
    if chars:
        vocab = UnicodeCharsVocabulary(self._tmp_vocab, 5)
    else:
        vocab = Vocabulary(self._tmp_vocab)

    if not bidirectional:
        data = LMDataset(self._tmp_train, vocab, reverse=reverse)
    else:
        data = BidirectionalLMDataset(self._tmp_train, vocab)

    return data
def _get_data(self, bidirectional, use_chars, test=False):
    vocab_file = os.path.join(FIXTURES, 'vocab.txt')
    if use_chars:
        vocab = load_vocab(vocab_file, 10)
    else:
        vocab = load_vocab(vocab_file, None)

    prefix = os.path.join(FIXTURES, 'data.txt')

    if bidirectional:
        data = BidirectionalLMDataset(prefix, vocab, test=test)
    else:
        data = LMDataset(prefix, vocab, test=test, reverse=False)

    return data, vocab
def main(args):
    print('h0')
    vocab = load_vocab(args.vocab_file, 10)
    print('h1')

    batch_size = 64
    n_gpus = 3
    n_train_tokens = 4775300

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 10,
            'n_characters': 105047,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    inpattern = args.train_prefix
    data = BidirectionalLMDataset(inpattern, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train_with_single_core(options, data, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': args.n_epochs,
        'n_train_tokens': args.n_train_tokens,
        'batch_size': args.batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    train(options, data, args.n_gpus, args.save_dir, args.log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # load the options from a JSON file (the original passed the path straight
    # to json.load; the file needs to be opened first)
    with open(args.options_file) as f:
        options = json.load(f)

    # number of tokens in training data (this for 1B Word Benchmark)
    options['n_tokens_vocab'] = vocab.size

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # n_gpus was never defined in the original; assuming it is supplied on the
    # command line as args.n_gpus, as in the other training scripts
    n_gpus = args.n_gpus
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def resume(options, prefix, vocab, n_gpus, tf_save_dir, tf_log_dir, ckpt_file):
    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    tf.reset_default_graph()

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    train(options, data, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
    clean_checkpoint(tf_save_dir)
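# Possible usage of resume(): pick training back up from the latest checkpoint
# in a save directory. This is only an illustration; it assumes the
# load_options_latest_checkpoint and load_vocab helpers used elsewhere in these
# snippets are available in the same scope.
def resume_from_save_dir(save_dir, train_prefix, vocab_file, n_gpus=1):
    options, ckpt_file = load_options_latest_checkpoint(save_dir)
    max_word_length = options.get('char_cnn', {}).get('max_characters_per_token')
    vocab = load_vocab(vocab_file, max_word_length)
    resume(options, train_prefix, vocab, n_gpus, save_dir, save_dir, ckpt_file)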
def main(args):
    vocab = load_vocab(args.vocab_file, args.vocab_min_occur)

    train_tokens = 768648884  # (this for 1B Word Benchmark)
    if args.train_tokens == 'wikitext2':
        # Enwiki2 is 3x longer if split into sentences, and a further 1.5x
        # when using a sentence split size of 20
        train_tokens = 2051910 * 3 * 1.5
    elif args.train_tokens == 'wikitext103':
        train_tokens = 101425658 * 3 * 1.5  # wikitext-103

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': train_tokens,
        'batch_size': args.train_batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    # we don't shuffle here since our curriculum generator shuffles
    train_data = BidirectionalLMDataset(prefix, vocab, test=False,
                                        shuffle_on_load=False,
                                        curriculum=True, num_steps=20)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train_curriculum(options, train_data, args.n_gpus, tf_save_dir, tf_log_dir,
                     args.initial_competence, args.competence_increment,
                     args.target_batches, args.test_prefix, args.test_interval,
                     vocab)
def top_level(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
    vocab_file = os.path.join(args.save_dir, 'vocabs.txt')

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
def main(args):
    max_token_length = args.max_token_length and int(args.max_token_length)

    print("args.vocab_file: ", args.vocab_file)
    print("max_token_length: ", max_token_length)
    print("args.stroke_vocab_file: ", args.stroke_vocab_file)

    # load the vocab
    # vocab = load_vocab(args.vocab_file, 50)
    vocab = load_vocab(args.vocab_file,
                       args.stroke_vocab_file,  # Winfred stroke_vocab
                       max_token_length)        # Winfred stroke_vocab

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 10731134  # 768648884

    # options = {
    #     'bidirectional': True,
    #     'char_cnn': {'activation': 'relu',
    #                  'embedding': {'dim': 16},
    #                  'filters': [[1, 32],
    #                              [2, 32],
    #                              [3, 64],
    #                              [4, 128],
    #                              [5, 256],
    #                              [6, 512],
    #                              [7, 1024]],
    #                  'max_characters_per_token': max_token_length,
    #                  'n_characters': 266,  # original 261 + 5 stroke characters
    #                  'n_highway': 2},  # 2
    #     'dropout': 0.1,
    #     'lstm': {
    #         'cell_clip': 3,
    #         'dim': 4096,
    #         'n_layers': 2,
    #         'proj_clip': 3,
    #         'projection_dim': 512,
    #         'use_skip_connections': True},
    #     'all_clip_norm_val': 10.0,
    #     'n_epochs': 1,
    #     'n_train_tokens': n_train_tokens,
    #     'batch_size': batch_size,
    #     'n_tokens_vocab': vocab.size,
    #     'unroll_steps': 20,
    #     'n_negative_samples_batch': 8192,
    # }

    # Add by Winfred
    option_file = os.path.join(args.save_dir, "options.json")
    with open(option_file, "r") as f:
        options = json.load(f)

    if max_token_length:
        options["char_cnn"]["max_characters_per_token"] = max_token_length
        print("Wrong max_token_length, already corrected")
    if "char_cnn" in options:
        options["char_cnn"]["n_characters"] = 266
        print("Wrong n_characters, already corrected")
    # End

    prefix = args.train_prefix
    data = BidirectionalLMDataset(
        prefix, vocab, test=False,
        shuffle_on_load=False,           # True
        do_record=args.do_record,        # Add by Winfred
        records_path=args.records_path,  # Add by Winfred
        vocab_file=args.vocab_file)      # Add by Winfred

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=args.restart_ckpt_file)
def main(args):
    # load the vocab
    # the longest token in the vocab is 10 syllables --> 30 characters after jamo decomposition
    # bos char + 30 + eos char = 32
    vocab = load_vocab(args.vocab_file, 32)

    # define the options
    # batch size for each GPU
    batch_size = 64 * 2
    n_gpus = 1

    # number of unique tokens in the tokenized "Science of Love" KakaoTalk data
    # (identified_corpus_20180105)
    # (-> this should actually be the total token count, not the unique count)
    # n_train_tokens = 609518
    # n_train_tokens = 626932956  # number of tokens in 8000pair_tokenized_corpus.txt (~620M)
    # token count used temporarily
    n_train_tokens = 200000000

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'tanh',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 32,
            'n_characters': 62,
            'n_highway': 2,
        },
        'dropout': 0.2,
        'lstm': {
            'cell_clip': 3,
            'dim': 256,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 256,
            'use_skip_connections': True,
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 10,
        'n_negative_samples_batch': 4096,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(filepattern=prefix, vocab=vocab, test=False,
                                  shuffle_on_load=True, with_tab=False)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(
        options, data, n_gpus, tf_save_dir, tf_log_dir,
        restart_ckpt_file='/media/scatter/scatterdisk/elmo_ckpt/elmo_ckpt_0919_2142/model.ckpt_batch-625000'
    )
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50, variable=args.variable)
    if args.variable:
        vocab.save_vocab(args.save_dir)

    # define the options
    if args.batch_size > 0:
        batch_size = args.batch_size
    else:
        batch_size = 128  # batch size for each GPU

    if args.n_epochs > 0:
        n_epochs = args.n_epochs
    else:
        n_epochs = 10

    n_gpus = 1

    if args.lang == 'ga':
        n_train_tokens = 3573002
    elif args.lang == 'mt':
        n_train_tokens = 1045392
    elif args.lang == 'sg':
        n_train_tokens = 1196930
    elif args.lang == 'vi':
        n_train_tokens = 5552361
    else:
        # raising a bare string is invalid in Python 3; wrap it in an exception
        raise ValueError(f'Unrecognized language: {args.lang}')

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': vocab.n_chars if args.variable else 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': n_epochs,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)