def test_running():
    epochs = 100000
    seq_batch_size = 100
    print_yes = 100
    loss_func = torch.nn.functional.nll_loss

    # create network and optimizer
    net = RNN(100, 120, 150, 2)
    net.to(device)  # move the model to the device (CPU/GPU)
    optim = torch.optim.Adam(net.parameters(), lr=3e-5)

    # main training loop:
    for epoch in range(epochs):
        dat = get_batch(train_data, seq_batch_size)
        dat = torch.LongTensor([vocab.find(item) for item in dat])

        # pull x and y: inputs are all but the last token, targets are shifted by one
        x_t = dat[:-1]
        y_t = dat[1:]
        hidden = net.init_hidden()

        # move inputs, targets, and hidden state to the device
        x_t, y_t, hidden = x_t.to(device), y_t.to(device), hidden.to(device)

        # forward pass
        logprob, hidden = net.forward(x_t, hidden)
        loss = loss_func(logprob, y_t)

        # update
        optim.zero_grad()
        loss.backward()
        optim.step()

        # print the loss (and a text sample) every print_yes iterations
        if epoch % print_yes == 0:
            print('*' * 100)
            print('\n epoch {}, loss: {} \n'.format(epoch, loss.item()))
            # make sure to pass True flag for running on cuda
            print('sample speech:\n', run_words(net, vocab, 500, True))
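# A minimal sketch (not the project's own test) of the input contract that
# torch.nn.functional.nll_loss imposes on the call above: it expects (N, C)
# log-probabilities and (N,) integer class targets, so the RNN is assumed to end
# in a log_softmax. All sizes below are hypothetical.
def _nll_loss_shape_check(seq_len=100, vocab_size=100):
    import torch
    import torch.nn.functional as F

    logits = torch.randn(seq_len, vocab_size)           # raw scores, one row per time step
    log_prob = F.log_softmax(logits, dim=1)              # nll_loss wants log-probabilities
    targets = torch.randint(0, vocab_size, (seq_len,))   # next-token class indices
    return F.nll_loss(log_prob, targets)                 # scalar loss, same call shape as above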
def generate_sequences(id_2_word, num_samples, model_type, emb_size, hidden_size,
                       seq_len, batch_size, num_layers, dp_keep_prob, vocab_size, path):
    if model_type == 'RNN':
        model = RNN(emb_size=emb_size, hidden_size=hidden_size, seq_len=seq_len,
                    batch_size=batch_size, vocab_size=vocab_size,
                    num_layers=num_layers, dp_keep_prob=dp_keep_prob)
    else:
        model = GRU(emb_size=emb_size, hidden_size=hidden_size, seq_len=seq_len,
                    batch_size=batch_size, vocab_size=vocab_size,
                    num_layers=num_layers, dp_keep_prob=dp_keep_prob)
    model.load_state_dict(torch.load(path))
    model = model.to(device)

    # zero-initialized hidden state and a batch of first tokens sampled uniformly
    hidden = nn.Parameter(torch.zeros(num_layers, num_samples, hidden_size)).to(device)
    input = torch.ones(10000) * 1 / 1000
    input = torch.multinomial(input, num_samples).to(device)

    output = model.generate(input, hidden, seq_len)

    # write the generated token ids back out as words, one sequence per line
    with open(model_type + '_generated_sequences' + '.txt', 'w') as f:
        for i in range(num_samples):
            for j in range(seq_len):
                f.write(id_2_word.get(output[j, i].item()) + ' ')
            f.write('\n')
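# A minimal usage sketch, assuming a trained checkpoint exists on disk and that
# id_2_word maps token ids back to strings. The hyperparameters must match the ones
# the checkpoint was trained with; the filename below is a hypothetical example, and
# vocab_size=10000 mirrors the hard-coded 10000-way uniform draw inside the function.
def _example_generate_sequences(id_2_word, path='GRU_best_params.pt'):
    generate_sequences(id_2_word=id_2_word, num_samples=10, model_type='GRU',
                       emb_size=200, hidden_size=1500, seq_len=35, batch_size=20,
                       num_layers=2, dp_keep_prob=0.35, vocab_size=10000,
                       path=path)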
def no_test_forward():
    loss_func = torch.nn.functional.nll_loss
    net = RNN(100, 100, 100)
    net.to(device)  # move the model to the device
    optim = torch.optim.Adam(net.parameters(), lr=1e-4)

    # step 2: create a training batch of data (size 101), format it, and
    # convert it to a PyTorch long tensor
    dat = get_batch(train_data, 100)
    dat = torch.LongTensor([vocab.find(item) for item in dat])

    # step 3: split dat into input/output
    x_t = dat[:-1]
    y_t = dat[1:]
    ho = net.init_hidden()

    # remember to load all variables used by the model onto the device;
    # this means the inputs and targets as well as the hidden state
    x_t, y_t, ho = x_t.to(device), y_t.to(device), ho.to(device)

    # test forward pass
    log_prob, hidden = net.forward(x_t, ho)
    # check whether the hidden state returned by the forward pass is already on cuda
    # log_prob2, hidden2 = net.forward(x_t, hidden)

    loss = loss_func(log_prob, y_t)
    optim.zero_grad()
    loss.backward()
    optim.step()
def make_my_model(model_name, device, seq_len=35, batch_size=20, pt=None):
    # --model=RNN --optimizer=ADAM --initial_lr=0.0001 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35 --save_best
    # --model=GRU --optimizer=SGD_LR_SCHEDULE --initial_lr=10 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35 --save_best
    # --model=TRANSFORMER --optimizer=SGD_LR_SCHEDULE --initial_lr=20 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=6 --dp_keep_prob=0.9 --save_best
    if model_name == 'RNN':
        model = RNN(emb_size=200, hidden_size=1500, seq_len=seq_len, batch_size=batch_size,
                    vocab_size=vocab_size, num_layers=2, dp_keep_prob=0.35)
    elif model_name == 'GRU':
        model = GRU(emb_size=200, hidden_size=1500, seq_len=seq_len, batch_size=batch_size,
                    vocab_size=vocab_size, num_layers=2, dp_keep_prob=0.35)
    elif model_name == 'TRANSFORMER':
        model = TRANSFORMER(vocab_size=vocab_size, n_units=512, n_blocks=6, dropout=1. - 0.9)
        # these 3 attributes don't affect the Transformer's computations;
        # they are only used in run_epoch
        model.batch_size = 128
        model.seq_len = 35
        model.vocab_size = vocab_size
    else:
        print("ERROR: Model type not recognized.")
        return

    # Model to device
    model = model.to(device)

    # Load pt
    if pt is not None:
        model.load_state_dict(torch.load(pt, map_location=device))

    return model
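# A minimal usage sketch, assuming torch is imported and vocab_size is defined at
# module level, as make_my_model requires. The checkpoint filename is hypothetical.
def _example_make_my_model():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # a fresh GRU with the hyperparameters hard-coded above
    gru = make_my_model('GRU', device)
    # the same architecture restored from a saved state_dict
    gru_trained = make_my_model('GRU', device, pt='GRU_best_params.pt')
    return gru, gru_trained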
def load_model(model_info, device, vocab_size, emb_size=200, load_on_device=True):
    params_path = model_info.get_params_path()
    if model_info.model == 'RNN':
        model = RNN(emb_size=emb_size, hidden_size=model_info.hidden_size,
                    seq_len=model_info.seq_len, batch_size=model_info.batch_size,
                    vocab_size=vocab_size, num_layers=model_info.num_layers,
                    dp_keep_prob=model_info.dp_keep_prob)
    elif model_info.model == 'GRU':
        model = GRU(emb_size=emb_size, hidden_size=model_info.hidden_size,
                    seq_len=model_info.seq_len, batch_size=model_info.batch_size,
                    vocab_size=vocab_size, num_layers=model_info.num_layers,
                    dp_keep_prob=model_info.dp_keep_prob)
    else:
        model = TRANSFORMER(vocab_size=vocab_size, n_units=model_info.hidden_size,
                            n_blocks=model_info.num_layers,
                            dropout=1. - model_info.dp_keep_prob)
        model.batch_size = model_info.batch_size
        model.seq_len = model_info.seq_len
        model.vocab_size = vocab_size

    if load_on_device:
        model = model.to(device)

    model.load_state_dict(torch.load(params_path, map_location=device))
    return model
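# A minimal sketch, assuming load_model is driven by an experiment-tracking object
# that exposes .model, .hidden_size, .seq_len, .batch_size, .num_layers,
# .dp_keep_prob and a get_params_path() method. The dataclass and checkpoint path
# below are hypothetical stand-ins for whatever object the original codebase uses.
from dataclasses import dataclass

@dataclass
class _ExampleModelInfo:
    model: str = 'RNN'
    hidden_size: int = 1500
    seq_len: int = 35
    batch_size: int = 20
    num_layers: int = 2
    dp_keep_prob: float = 0.35
    params_path: str = 'RNN_best_params.pt'  # hypothetical checkpoint

    def get_params_path(self):
        return self.params_path

# Example call (device and vocab_size assumed to be defined elsewhere in the script):
# model = load_model(_ExampleModelInfo(), device, vocab_size=10000)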
    # different things here than in the RNNs.
    # Also, the Transformer has other hyperparameters
    # (such as the number of attention heads) which can change its behavior.
    model = TRANSFORMER(vocab_size=vocab_size, n_units=args.hidden_size,
                        n_blocks=args.num_layers, dropout=1. - args.dp_keep_prob)
    # these 3 attributes don't affect the Transformer's computations;
    # they are only used in run_epoch
    model.batch_size = args.batch_size
    model.seq_len = args.seq_len
    model.vocab_size = vocab_size
else:
    print("Model type not recognized.")

model = model.to(device)

# LOSS FUNCTION
loss_fn = nn.CrossEntropyLoss()
if args.optimizer == 'ADAM':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.initial_lr)

# LEARNING RATE SCHEDULE
lr = args.initial_lr
lr_decay_base = 1 / 1.15
m_flat_lr = 14.0  # we will not touch lr for the first m_flat_lr epochs

###############################################################################
#
# DEFINE COMPUTATIONS FOR PROCESSING ONE EPOCH
#
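# A minimal sketch of how lr, lr_decay_base and m_flat_lr are typically combined:
# the learning rate is held at initial_lr for the first m_flat_lr epochs, then
# decayed geometrically each epoch. The script applies its own schedule inside its
# epoch loop (not shown here), so treat this helper as illustrative only.
def _decayed_lr(initial_lr, epoch, lr_decay_base=1 / 1.15, m_flat_lr=14.0):
    lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)  # 1.0 until epoch m_flat_lr
    return initial_lr * lr_decay                           # then shrink by ~13% per epoch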
LOG(f"[DATA] Data is loaded. Vocabulary size is {len(word2idx)}") # Model Definition model = RNN(vocab_size=len(word2idx), embedding_dim=128, hidden_dim=256, num_layers=2, target="lstm") optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) criterion = nn.CrossEntropyLoss() loss_meter = tnt.meter.AverageValueMeter() if MODEL_PATH is not None: model.load_state_dict(torch.load(MODEL_PATH)) model.to(device) LOG(f"[MODEL] Build model complete.") # Train if MODE == "train": for epoch in range(EPOCH): loss_meter.reset() for index, data in tqdm.tqdm(enumerate(dataloader, 0)): data = data.long().contiguous().to(device) optimizer.zero_grad() input_, target = data[:, :-1], data[:, 1:] output, _ = model(input_) loss = criterion(output, target.reshape(-1)) loss.backward()