vocab = len(dic)
print("Vocab=%d" % vocab)

X_train_batch = drop_tail(X_train_batch, seq_len)
X_val_batch = drop_tail(X_val_batch, seq_len)
print(X_train_batch.shape)
print(X_val_batch.shape)

model = lstm.setup_rnn_model(mx.cpu(),
                             num_lstm_layer=num_lstm_layer,
                             seq_len=seq_len,
                             num_hidden=num_hidden,
                             num_embed=num_embed,
                             num_label=vocab,
                             batch_size=batch_size,
                             input_size=vocab,
                             initializer=mx.initializer.Uniform(0.1),
                             dropout=0.5)

# Reference hyperparameters: max_grad_norm=5.0, update_period=1, wd=0,
# learning_rate=0.1, num_round=25
lstm.train_lstm(model, X_train_batch, X_val_batch,
                num_round=num_round,
                half_life=2,
                max_grad_norm=max_grad_norm,
                update_period=update_period,
                learning_rate=learning_rate,
                wd=wd)  # momentum=momentum
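drop_tail is used above but not defined in this snippet. A minimal sketch of what it is assumed to do, namely truncate the batched data so the number of time steps is an exact multiple of seq_len; the (num_steps, batch_size) array layout is an assumption:

import numpy as np

def drop_tail(X, seq_len):
    # Hypothetical helper: keep only as many leading time steps as fit into
    # whole windows of length seq_len, so every unrolled batch is full.
    num_steps = X.shape[0]
    usable = (num_steps // seq_len) * seq_len
    return X[:usable]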
# A simple two GPU placement plan
group2ctx = {'embed': mx.gpu(0), 'decode': mx.gpu(ngpu - 1)}
for i in range(num_lstm_layer):
    group2ctx['layer%d' % i] = mx.gpu(i * ngpu // num_lstm_layer)

# Whether to do group-wise concat
concat_decode = False
use_loss = True

model = lstm.setup_rnn_model(mx.gpu(),
                             group2ctx=group2ctx,
                             concat_decode=concat_decode,
                             use_loss=use_loss,
                             num_lstm_layer=num_lstm_layer,
                             seq_len=X_train_batch.default_bucket_key,
                             num_hidden=num_hidden,
                             num_embed=num_embed,
                             num_label=vocab,
                             batch_size=batch_size,
                             input_size=vocab,
                             initializer=mx.initializer.Uniform(0.1),
                             dropout=0.5,
                             buckets=buckets)

lstm.train_lstm(model, X_train_batch, X_val_batch,
                num_round=num_round,
                concat_decode=concat_decode,
                use_loss=use_loss,
                half_life=2,
                max_grad_norm=max_grad_norm,
                update_period=update_period,
                learning_rate=learning_rate,
                batch_size=batch_size,
                wd=wd)
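The group2ctx dictionary only takes effect when the symbols in the network carry matching ctx_group attributes; lstm.setup_rnn_model is expected to set these internally. A minimal standalone sketch of how that pairing works in MXNet's symbolic API, using illustrative layer names and shapes rather than the tutorial's actual network:

import mxnet as mx

# Tag symbols with a context group while they are being built.
with mx.AttrScope(ctx_group='embed'):
    data = mx.sym.Variable('data')
    embed = mx.sym.Embedding(data=data, input_dim=10000, output_dim=200, name='embed')

with mx.AttrScope(ctx_group='decode'):
    decode = mx.sym.FullyConnected(data=embed, num_hidden=10000, name='decode')

# At bind time, group2ctx maps every ctx_group tag to a device.
group2ctx = {'embed': mx.gpu(0), 'decode': mx.gpu(1)}
exe = decode.simple_bind(mx.gpu(0), group2ctx=group2ctx, data=(32, 35))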
X_train, dic = load_data("./data/ptb.train.txt")
X_val, _ = load_data("./data/ptb.valid.txt", dic)
X_train_batch = replicate_data(X_train, batch_size)
X_val_batch = replicate_data(X_val, batch_size)
vocab = len(dic)
print("Vocab=%d" % vocab)
X_train_batch = drop_tail(X_train_batch, seq_len)
X_val_batch = drop_tail(X_val_batch, seq_len)

model = lstm.setup_rnn_model(mx.gpu(),
                             num_lstm_layer=num_lstm_layer,
                             seq_len=seq_len,
                             num_hidden=num_hidden,
                             num_embed=num_embed,
                             num_label=vocab,
                             batch_size=batch_size,
                             input_size=vocab,
                             initializer=mx.initializer.Uniform(0.1),
                             dropout=0.5)

lstm.train_lstm(model, X_train_batch, X_val_batch,
                num_round=num_round,
                half_life=2,
                max_grad_norm=max_grad_norm,
                update_period=update_period,
                learning_rate=learning_rate,
                wd=wd,
                momentum=momentum)
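load_data and replicate_data come from the accompanying helper script and are not reproduced here. A rough sketch of the preprocessing they are assumed to perform: whitespace-tokenize the PTB text into integer ids, then fold the id stream into batch_size parallel streams.

import numpy as np

def load_data(path, dic=None):
    # Hypothetical helper: map whitespace-separated tokens to integer ids,
    # growing the dictionary unless a fixed one is supplied.
    if dic is None:
        dic = {}
    ids = []
    with open(path) as f:
        for line in f:
            for word in line.split() + ['<eos>']:
                if word not in dic:
                    dic[word] = len(dic)
                ids.append(dic[word])
    return np.array(ids, dtype='int32'), dic

def replicate_data(ids, batch_size):
    # Hypothetical helper: reshape the flat id stream into
    # (num_steps, batch_size) so each column is a contiguous token stream.
    num_steps = len(ids) // batch_size
    return ids[:num_steps * batch_size].reshape(batch_size, num_steps).T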
ngpu = 1
# A simple two GPU placement plan
group2ctx = {'embed': mx.gpu(0), 'decode': mx.gpu(ngpu - 1)}
for i in range(num_lstm_layer):
    group2ctx['layer%d' % i] = mx.gpu(i * ngpu // num_lstm_layer)

# Whether to do group-wise concat
concat_decode = True

model = lstm.setup_rnn_model(mx.gpu(),
                             group2ctx=group2ctx,
                             concat_decode=concat_decode,
                             num_lstm_layer=num_lstm_layer,
                             seq_len=seq_len,
                             num_hidden=num_hidden,
                             num_embed=num_embed,
                             num_label=vocab,
                             batch_size=batch_size,
                             input_size=vocab,
                             initializer=mx.initializer.Uniform(0.1),
                             dropout=0.5)

lstm.train_lstm(model, X_train_batch, X_val_batch,
                num_round=num_round,
                concat_decode=concat_decode,
                half_life=2,
                max_grad_norm=max_grad_norm,
                update_period=update_period,
                learning_rate=learning_rate,
                wd=wd)
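half_life=2 suggests a fixed epoch-based learning-rate decay inside train_lstm. One plausible reading of that argument, shown only as an illustration (the actual decay logic lives in the helper script):

def halved_lr(base_lr, epoch, half_life):
    # Illustrative only: halve the learning rate every half_life epochs.
    return base_lr * (0.5 ** (epoch // half_life))

# With learning_rate=0.1 and half_life=2:
#   epochs 0-1 -> 0.1, epochs 2-3 -> 0.05, epochs 4-5 -> 0.025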