def best_arch_search():
    """Sample candidate architectures, score each on held-out data, and save
    the genotypes sorted by validation reward to search_result.csv."""
    model.eval()
    result_df = pd.DataFrame(columns=['Genotype', 'Val_reward'])
    ntokens = len(corpus.dictionary)
    i = 0
    hidden = model.init_hidden(eval_batch_size)
    for m in range(search_arch_num):
        parallel_model.sample_new_architecture()
        data, targets = get_batch(val_data, i, args)
        targets = targets.view(-1)
        hidden = repackage_hidden(hidden)
        # log_prob, hidden = parallel_model(data, hidden)
        # loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        loss, hidden = parallel_model._loss(hidden, data, targets)
        reward = architect.reward_c / torch.exp(loss)
        gene = parallel_model.genotype()
        temp_df = pd.DataFrame([[gene, reward.item()]],
                               columns=['Genotype', 'Val_reward'])
        result_df = pd.concat([result_df, temp_df], ignore_index=True)
        i += args.bptt
        if i >= search_data.size(0) - 2:
            i = 0
    result_df = result_df.sort_values(by='Val_reward', ascending=False)
    result_df.to_csv('search_result.csv')
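
# Hypothetical usage sketch (not part of the repository): read the
# search_result.csv written by best_arch_search() back in and return the
# genotype with the highest validation reward.
def load_best_genotype(path='search_result.csv'):
    df = pd.read_csv(path)
    # best_arch_search() already sorts by 'Val_reward' descending; sort again
    # defensively in case the file was edited by hand.
    df = df.sort_values(by='Val_reward', ascending=False)
    return df.iloc[0]['Genotype']
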
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i, args, evaluation=True)
            targets = targets.view(-1)
            log_prob, hidden = parallel_model(data, hidden)
            loss = nn.functional.nll_loss(
                log_prob.view(-1, log_prob.size(2)), targets).data
            total_loss += loss * len(data)
            hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)
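
# The evaluation and training loops here repeatedly call repackage_hidden() to
# cut the autograd history of the recurrent state. A minimal sketch of the
# standard implementation from the PyTorch word-language-model example is shown
# for reference; the repository's own utility is assumed to be equivalent
# (it must accept a Tensor or a nested tuple of Tensors).
def repackage_hidden(h):
    """Detach hidden states from their history so backprop stops at the
    current window instead of reaching back to the start of the data."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)
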
def evaluate(data_source, batch_size=10, data_name='dev'):
    # NOTE: the passed-in data_source is replaced by a freshly loaded dev split.
    data_source = DataLoader(args.data_dir + '/dev.json', batch_size, opt, vocab,
                             evaluation=True)
    print('Evaluating Model!')
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    # ntokens = len(corpus.dictionary)
    # ntokens = len(vocab.word2id)

    # for i in range(0, data_source.size(0) - 1, args.bptt):
    predictions = []
    for i in range(len(data_source)):
        batch = data_source.next_batch()
        batch_size = len(batch['relation'])
        hidden = model.init_hidden(batch_size)[0]
        # data, targets = get_batch(data_source, i, args, evaluation=True)
        data = batch
        targets = batch['relation']
        targets = targets.view(-1)
        # print('tokens: {} | hidden: {}'.format(batch['tokens'].shape, hidden.shape))
        log_prob, hidden = parallel_model(data, hidden)
        loss = nn.functional.nll_loss(log_prob, targets).data  # log_prob.view(-1, log_prob.size(2))
        # Weight the batch loss by the number of examples in the batch.
        total_loss += loss * batch_size

        batch_predictions = torch.argmax(log_prob, dim=-1).cpu().data.numpy()
        batch_predictions = [id2label[prediction] for prediction in batch_predictions]
        predictions += batch_predictions
        # hidden = repackage_hidden(hidden)

    precision, recall, f1 = scorer.score(dev_data.gold(), predictions)
    logging.info('{} set | Precision: {} | Recall: {} | F1: {}'.format(
        data_name, precision, recall, f1))
    print('total loss: {}'.format(total_loss))
    return total_loss / len(data_source)
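
# scorer.score() above returns precision/recall/F1 over the predicted relation
# labels. A hypothetical sketch (not the repository's scorer) of micro-averaged
# scoring, assuming the usual TACRED convention of treating 'no_relation' as
# the negative class:
def micro_prf1(gold, predicted, negative_label='no_relation'):
    # Count matches over positive (non-negative) labels only, micro-averaged.
    correct = guessed = actual = 0
    for g, p in zip(gold, predicted):
        if g != negative_label:
            actual += 1
        if p != negative_label:
            guessed += 1
            if g == p:
                correct += 1
    precision = correct / guessed if guessed else 0.0
    recall = correct / actual if actual else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1
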
def train():
    assert (
        args.batch_size % args.small_batch_size == 0
    ), "batch_size must be divisible by small_batch_size"

    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = [
        model.init_hidden(args.small_batch_size)
        for _ in range(args.batch_size // args.small_batch_size)
    ]
    hidden_valid = [
        model.init_hidden(args.small_batch_size)
        for _ in range(args.batch_size // args.small_batch_size)
    ]
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.0
        # Prevent excessively small or negative sequence lengths
        # seq_len = max(5, int(np.random.normal(bptt, 5)))
        # # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        seq_len = int(bptt)

        lr2 = optimizer.param_groups[0]["lr"]
        optimizer.param_groups[0]["lr"] = lr2 * seq_len / args.bptt
        model.train()

        data_valid, targets_valid = get_batch(
            search_data, i % (search_data.size(0) - 1), args
        )
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            # cur_data, cur_targets = (
            #     data[:, start:end],
            #     targets[:, start:end].contiguous().view(-1),
            # )
            # cur_data_valid, cur_targets_valid = (
            #     data_valid[:, start:end],
            #     targets_valid[:, start:end].contiguous(),
            # )
            cur_data, cur_targets = (data, targets.contiguous())
            cur_data_valid, cur_targets_valid = (data_valid, targets_valid.contiguous())

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])
            hidden_valid[s_id] = repackage_hidden(hidden_valid[s_id])

            hidden_valid[s_id], grad_norm = architect.step(
                hidden[s_id],
                cur_data,
                cur_targets,
                hidden_valid[s_id],
                cur_data_valid,
                cur_targets_valid,
                optimizer,
                args.unrolled,
            )

            # assuming small_batch_size = batch_size so we don't accumulate gradients
            optimizer.zero_grad()
            hidden[s_id] = repackage_hidden(hidden[s_id])

            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(
                cur_data, hidden[s_id], return_h=True
            )
            raw_loss = nn.functional.nll_loss(
                log_prob.view(-1, log_prob.size(2)), cur_targets
            )

            loss = raw_loss
            # Activation Regularization
            if args.alpha > 0:
                loss = loss + sum(
                    args.alpha * dropped_rnn_h.pow(2).mean()
                    for dropped_rnn_h in dropped_rnn_hs[-1:]
                )
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(
                args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                for rnn_h in rnn_hs[-1:]
            )
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]["lr"] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            logging.info(parallel_model.genotype())
            print(F.softmax(parallel_model.weights, dim=-1))
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            logging.info(
                "| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | "
                "loss {:5.2f} | ppl {:8.2f}".format(
                    epoch,
                    batch,
                    len(train_data) // args.bptt,
                    optimizer.param_groups[0]["lr"],
                    elapsed * 1000 / args.log_interval,
                    cur_loss,
                    math.exp(cur_loss),
                )
            )
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len
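
# The AR/TAR terms in train() above follow the activation-regularization scheme
# used for AWD-LSTM-style training. A self-contained sketch of the same penalty
# as a helper (illustrative only; the functions here keep it inline), assuming
# rnn_hs and dropped_rnn_hs are lists of (seq_len, batch, nhid) tensors as
# returned by parallel_model(..., return_h=True):
def ar_tar_penalty(raw_loss, rnn_hs, dropped_rnn_hs, alpha, beta):
    loss = raw_loss
    if alpha > 0:
        # Activation Regularization: penalize large (dropped) activations of the last layer.
        loss = loss + sum(alpha * h.pow(2).mean() for h in dropped_rnn_hs[-1:])
    # Temporal Activation Regularization: penalize large changes between consecutive steps.
    loss = loss + sum(beta * (h[1:] - h[:-1]).pow(2).mean() for h in rnn_hs[-1:])
    return loss
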
def train(train_data, dev_data):
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'
    ntokens = len(vocab.word2id)

    # Turn on training mode which enables dropout.
    total_loss = 0
    total_valid_loss = 0
    start_time = time.time()
    # ntokens = len(corpus.dictionary)

    # batch, i = 0, 0
    for batch in range(len(train_data)):
        train_batch = train_data.next_batch()
        dev_batch = dev_data.next_batch()
        # for batch, (train_batch, dev_batch) in enumerate(zip(train_data, dev_data)):
        # hidden = [model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size)]
        # hidden_valid = [model.init_hidden(args.small_batch_size) for _ in
        #                 range(args.batch_size // args.small_batch_size)]
        # print('hidden shape: {} | hidden valid: {} |'.format(hidden.shape, hidden_valid.shape))

        # while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        # seq_len = max(5, int(np.random.normal(bptt, 5)))
        # # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        # seq_len = int(bptt)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2  # * seq_len / args.bptt
        model.train()

        # data_valid, targets_valid = get_batch(search_data, i % (search_data.size(0) - 1), args)
        # data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        # start, end, s_id = 0, args.small_batch_size, 0
        cur_data = train_batch
        cur_targets = train_batch['relation']
        cur_data_valid = dev_batch
        cur_targets_valid = dev_batch['relation']

        hidden = model.init_hidden(len(train_batch['relation']))[0]
        hidden_valid = model.init_hidden(len(dev_batch['relation']))[0]
        # print('Train Batch Shapes: | Hidden: {} | Tokens: {} |'.format(hidden.shape, cur_data['tokens'].shape))
        # print('Dev Batch Shapes: | Hidden: {} | Tokens: {} |'.format(hidden_valid.shape, cur_data_valid['tokens'].shape))
        assert hidden.shape[1] == cur_data['tokens'].shape[0], \
            'Hidden shape: {} | tokens shape: {}'.format(hidden.shape, cur_data['tokens'].shape)
        assert hidden_valid.shape[1] == cur_data_valid['tokens'].shape[0], \
            'Hidden shape: {} | tokens shape: {}'.format(hidden_valid.shape, cur_data_valid['tokens'].shape)

        # while start < args.batch_size:
        # cur_data, cur_targets = data[:, start:end], targets[:, start:end].contiguous().view(-1)
        # cur_data_valid, cur_targets_valid = data_valid[:, start:end], targets_valid[:, start:end].contiguous().view(-1)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        # hidden[s_id] = repackage_hidden(hidden[s_id])
        # hidden_valid[s_id] = repackage_hidden(hidden_valid[s_id])
        # print(hidden.shape)
        # hidden = repackage_hidden(hidden)
        # hidden_valid = repackage_hidden(hidden_valid)

        # hidden_valid[s_id], grad_norm = architect.step(
        #     hidden[s_id], cur_data, cur_targets,
        #     hidden_valid[s_id], cur_data_valid, cur_targets_valid,
        #     optimizer,
        #     args.unrolled)
        hidden_valid, valid_loss = architect.step(hidden, cur_data, cur_targets,
                                                  hidden_valid, cur_data_valid,
                                                  cur_targets_valid, optimizer,
                                                  args.unrolled)
        total_valid_loss += valid_loss.data
        # print('Finished architect step...')

        # assuming small_batch_size = batch_size so we don't accumulate gradients
        optimizer.zero_grad()
        # hidden[s_id] = repackage_hidden(hidden[s_id])
        # hidden = repackage_hidden(hidden)

        # log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(cur_data, hidden[s_id], return_h=True)
        # print('Entering model training...')
        hidden = torch.autograd.Variable(hidden.data)
        # Hidden should be all zeros
        print('hidden all zeros?: (not {})'.format(torch.sum(hidden)))
        log_prob, hidden, rnn_hs, dropped_rnn_hs = parallel_model(
            cur_data, hidden, return_h=True)
        # print('received predictions')
        raw_loss = nn.functional.nll_loss(log_prob, cur_targets)
        # print('received loss')

        loss = raw_loss
        # Activation Regularization
        if args.alpha > 0:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                          for rnn_h in rnn_hs[-1:])
        # loss *= args.small_batch_size / args.batch_size
        total_loss += raw_loss.data  # * args.small_batch_size / args.batch_size
        loss.backward()

        # s_id += 1
        # start = end
        # end = start + args.small_batch_size
        # print('backpropagated...')
        gc.collect()
        # print('garbage collected...')

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # print('clipped gradients...')
        optimizer.step()
        # print('updated gradients...')

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0:  # and batch > 0:
            logging.info(parallel_model.genotype())
            print(F.softmax(parallel_model.weights, dim=-1))
            # print('total loss: {}'.format(type(total_loss)))
            # print('total loss: {}'.format(total_loss))
            # print('total loss: {}'.format(total_loss.shape))
            # cur_loss = total_loss[0] / args.log_interval
            cur_loss = total_loss / args.log_interval
            cur_valid_loss = total_valid_loss / args.log_interval
            elapsed = time.time() - start_time
            logging.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f} | valid loss: {:5.2f} | valid ppl: {:5.2f}'.format(
                    epoch, batch, len(train_data), optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss),
                    cur_valid_loss, math.exp(cur_valid_loss)))
            total_loss = 0
            # Reset the running validation loss as well so both averages cover
            # the same logging window.
            total_valid_loss = 0
            start_time = time.time()
        # print('on to next batch...')
        # batch += 1
        # i += seq_len
    print('Reached end of epoch training!')
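
# Hypothetical per-epoch driver (illustrative only, not from this repository),
# shown commented out so it does not run at import time. It sketches how this
# train(train_data, dev_data) variant and the relation-extraction evaluate()
# might be wired together; args.epochs is an assumed argument name.
# for epoch in range(1, args.epochs + 1):
#     train(train_data, dev_data)            # weight + architecture updates
#     dev_loss = evaluate(dev_data, batch_size=args.batch_size, data_name='dev')
#     logging.info('| end of epoch {:3d} | dev loss {:5.2f} |'.format(epoch, dev_loss))
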
def train_arch():
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'

    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden_valid = [
        model.init_hidden(args.small_batch_size)
        for _ in range(args.batch_size // args.small_batch_size)
    ]
    batch, i = 0, 0
    ep_loss = 0
    model.eval()
    while i < search_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        # seq_len = max(5, int(np.random.normal(bptt, 5)))
        # # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        seq_len = int(bptt)

        data_valid, targets_valid = get_batch(search_data, i, args)

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            cur_data_valid, cur_targets_valid = (
                data_valid[:, start:end],
                targets_valid[:, start:end].contiguous().view(-1))

            hidden_valid[s_id] = repackage_hidden(hidden_valid[s_id])
            parallel_model.sample_new_architecture()
            if i == 0:
                for e in model.edge_weights:
                    print(F.softmax(e, dim=-1))
                print(F.softmax(model.weights, dim=-1))
                print(model.baseline)

            # Apply an architecture optimizer step only every arch_opt_step batches.
            is_opt_step = (batch + 1) % arch_opt_step == 0
            if i == 0:
                architect.optimizer.zero_grad()

            hidden_valid[s_id], raw_loss = architect.step(
                hidden_valid[s_id], cur_data_valid, cur_targets_valid, is_opt_step)
            raw_loss, hidden_valid[s_id] = model._loss(
                hidden_valid[s_id], cur_data_valid, cur_targets_valid)
            raw_loss = raw_loss.detach()

            loss = raw_loss
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            ep_loss += raw_loss * len(cur_data_valid)

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # total_loss += raw_loss.data
        if batch % args.log_interval == 0 and batch > 0:
            logging.info(parallel_model.genotype())
            print(F.softmax(parallel_model.weights, dim=-1))
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            logging.info(
                '| arch_epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(search_data) // args.bptt,
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len

    # Optimizer step for the residual of the valid queue.
    if not is_opt_step:
        architect.optimizer.step()
    return ep_loss.item() / len(search_data)
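
# train_arch() above applies the architecture update only every arch_opt_step
# batches and flushes any leftover gradients after the loop. A generic sketch of
# that accumulate-then-step pattern (illustrative; the real update lives inside
# architect.step(), and `losses` here is a hypothetical list of scalar loss
# tensors, one per batch):
def accumulated_updates(optimizer, losses, accumulation_steps):
    optimizer.zero_grad()
    for step, loss in enumerate(losses, start=1):
        loss.backward()
        if step % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
    # Flush gradients from the leftover batches, mirroring the final step above.
    if len(losses) % accumulation_steps != 0:
        optimizer.step()
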
def train():
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'

    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = [
        model.init_hidden(args.small_batch_size)
        for _ in range(args.batch_size // args.small_batch_size)
    ]
    batch, i = 0, 0
    model.train()
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        # seq_len = max(5, int(np.random.normal(bptt, 5)))
        # # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        seq_len = int(bptt)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt

        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            cur_data, cur_targets = (
                data[:, start:end],
                targets[:, start:end].contiguous().view(-1))

            optimizer.zero_grad()
            hidden[s_id] = repackage_hidden(hidden[s_id])
            parallel_model.sample_new_architecture()

            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(
                cur_data, hidden[s_id], return_h=True)
            raw_loss = nn.functional.nll_loss(
                log_prob.view(-1, log_prob.size(2)), cur_targets)

            loss = raw_loss
            # Activation Regularization
            if args.alpha > 0:
                loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                                  for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            logging.info(
                '| dag_epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // args.bptt,
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len
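
# The training loops above temporarily rescale the learning rate by
# seq_len / args.bptt and restore it after the optimizer step. An equivalent
# illustrative helper (hypothetical; not used by the code above):
from contextlib import contextmanager

@contextmanager
def scaled_lr(optimizer, scale):
    # Temporarily rescale the first param group's learning rate, then restore it.
    base_lr = optimizer.param_groups[0]['lr']
    optimizer.param_groups[0]['lr'] = base_lr * scale
    try:
        yield
    finally:
        optimizer.param_groups[0]['lr'] = base_lr
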