# Test Kaggle learntools
from learntools.core import binder
binder.bind(globals())
from learntools.python.ex1 import *

color = "blue"
q0.check()
print("learntools ok")

# PyTorch smoke test based on
# http://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html
import torch
import torch.nn as tnn
import torch.autograd as autograd

torch.manual_seed(31337)
linear_torch = tnn.Linear(5, 3)
data_torch = autograd.Variable(torch.randn(2, 5))
print(linear_torch(data_torch))
print("PyTorch ok")

import fastai
from fastai.io import get_data
print("fast.ai ok")

import numpy as np
print("Numpy imported ok")
print("Your lucky number is: " + str(np.random.randint(100)))

# Numpy must be linked to the MKL. (Occasionally, a third-party package will
# muck up the installation and numpy will be reinstalled with an OpenBLAS
# backing.)
from numpy.distutils.system_info import get_info
# This will fail if the MKL is not linked correctly. (The check itself was
# truncated in the snippet; the assert below is a plausible completion.)
assert len(get_info("blas_mkl")) > 0
def look_up_embed(self, id):
    lookup_tensor = torch.LongTensor([id - 1]).cuda()
    return self.embeds(autograd.Variable(lookup_tensor))
# (The opening of this snippet was truncated; the list comprehension below is
# reconstructed from the `trials += tmp_trial` that follows.)
tmp_trial = [
    os.path.join(parts[i], trial) for trial in os.listdir(parts[i])
]
trials += tmp_trial

running_loss = 0
while True:
    trial_order = np.random.permutation(trials)
    for itr in trial_order:
        gd.load_csv(os.path.join(itr, GAZE_NAME))
        inds = range(max(WINDOWS), len(gd.data) - 1, 1)
        order = np.random.permutation(inds)
        batches = load_batches(order, BATCH_SIZE, gd.data)
        for i in order:
            b1, b2, b3, lbl = next(batches)
            b1 = ag.Variable(b1)
            b2 = ag.Variable(b2)
            b3 = ag.Variable(b3)
            lbl = ag.Variable(lbl)
            optimizer.zero_grad()
            print('forward')
            out = gazenet(b1, b2, b3)
            print('loss')
            loss = criterion(out, lbl)
            print('backward')
            loss.backward()
            print('optimise')
            optimizer.step()
            print(loss.data[0])
            if loss.data[0] < lowest_loss:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)
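# A hypothetical usage sketch for prepare_sequence (the vocabulary below is
# invented for illustration): map tokens to indices, then wrap them as a Variable.
word_to_ix = {"the": 0, "dog": 1, "barked": 2}
seq_var = prepare_sequence(["the", "dog", "barked"], word_to_ix)
print(seq_var)  # Variable wrapping LongTensor([0, 1, 2])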
def run_epoch(data_loader, train_model, model, gen, optimizer, step, args):
    '''
    Train model for one pass of train data, and return loss, accuracy
    '''
    eval_model = not train_model
    data_iter = data_loader.__iter__()

    losses = []
    obj_losses = []
    k_selection_losses = []
    k_continuity_losses = []
    preds = []
    golds = []

    if train_model:
        model.train()
        gen.train()
    else:
        gen.eval()
        model.eval()

    num_batches_per_epoch = len(data_iter)
    if train_model:
        num_batches_per_epoch = min(len(data_iter), 10000)

    for _ in tqdm.tqdm(range(num_batches_per_epoch)):
        batch = next(data_iter)
        if train_model:
            step += 1
            if step % 100 == 0 or args.debug_mode:
                args.gumbel_temprature = max(
                    np.exp((step + 1) * -1 * args.gumbel_decay), .05)

        x_indx = utils.get_x_indx(batch, args, eval_model)
        text = batch['text']
        indices = batch['i']
        y = autograd.Variable(batch['y'], volatile=eval_model)

        if args.cuda:
            x_indx, y = x_indx.cuda(), y.cuda()

        if train_model:
            optimizer.zero_grad()

        if args.get_rationales:
            mask, z = gen(x_indx)
        else:
            mask = None

        logit, _ = model(x_indx, mask=mask)
        loss = get_loss(logit, y, args)
        obj_loss = loss

        if args.get_rationales:
            selection_cost, continuity_cost = gen.loss(mask, x_indx)
            loss += args.selection_lambda * selection_cost
            loss += args.continuity_lambda * continuity_cost

        if train_model:
            loss.backward()
            optimizer.step()

        if args.get_rationales:
            k_selection_losses.append(generic.tensor_to_numpy(selection_cost))
            k_continuity_losses.append(generic.tensor_to_numpy(continuity_cost))

        obj_losses.append(generic.tensor_to_numpy(obj_loss))
        losses.append(generic.tensor_to_numpy(loss))
        preds.extend(
            torch.max(logit.data, 1)[1].view(y.size()).cpu().numpy())  # Record predictions
        golds.extend(batch['y'].numpy())

    if args.objective in ['cross_entropy', 'margin']:
        metric = sklearn.metrics.accuracy_score(y_true=golds, y_pred=preds)
        confusion_matrix = sklearn.metrics.confusion_matrix(y_true=golds, y_pred=preds)
    elif args.objective == 'mse':
        metric = sklearn.metrics.mean_squared_error(y_true=golds, y_pred=preds)
        confusion_matrix = "NA"

    epoch_stat = {
        'loss': np.mean(losses),
        'obj_loss': np.mean(obj_losses),
        'metric': metric,
        'confusion_matrix': confusion_matrix
    }

    if args.get_rationales:
        epoch_stat['k_selection_loss'] = np.mean(k_selection_losses)
        epoch_stat['k_continuity_loss'] = np.mean(k_continuity_losses)

    return epoch_stat, step, losses, preds, golds
        super(LSTMpred, self).__init__()
        self.hidden_dim = hidden_dim
        self.hidden_layers = hidden_layers
        self.input_size = input_size
        self.output_size = output_size
        # lstm = nn.LSTM(input_dim, lstm_output/hidden_dim, num_of_layers)
        self.lstm = nn.LSTM(input_size, self.hidden_dim, hidden_layers,
                            batch_first=True)
        self.hidden2out = nn.Linear(self.hidden_dim, output_size)

    def init_hidden(self, batch):
        # (num_layers * num_directions, batch_size, hidden_size)
        return (autograd.Variable(torch.zeros(self.hidden_layers, batch, self.hidden_dim)),
                autograd.Variable(torch.zeros(self.hidden_layers, batch, self.hidden_dim)))

    def forward(self, batch_in, lengths):
        # print("inputs len", batch_in.size(), lengths)
        self.hidden = self.init_hidden(batch_in.size(0))
        pack = torch.nn.utils.rnn.pack_padded_sequence(batch_in, lengths,
                                                       batch_first=True)
        packed_output, (ht, ct) = self.lstm(pack, self.hidden)
        unpacked, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True)
        final_out = self.hidden2out(unpacked)
        return final_out


def pad_seq(sequence):
    ordered = sorted(sequence, key=len, reverse=True)
    lengths = [len(x) for x in ordered]
import sys
# (csv/pickle/torch imports added; the original snippet assumed them in scope)
import csv
import pickle
import torch
import torch.autograd as autograd

# Get the arguments
# open the argument parsing results (discourse arguments)
if len(sys.argv) != 3:
    print("USAGE> Prepare_Label_Vecs.py [label_file] [dataset_name]")
    sys.exit(1)  # exit added so the usage message is not just printed

ce_file = open(sys.argv[1], "r")
ce_csv = csv.reader(ce_file)

tweet_id = 0
ce_vec_seqs = []
ce_vec = []
next(ce_csv)  # skip the header
for line in ce_csv:
    causality_vec = []
    if tweet_id != int(line[0]):
        ce_vec_seqs.append(autograd.Variable(torch.LongTensor(ce_vec)))
        ce_vec = []
    ce_vec.append(int(line[2]))
    tweet_id = int(line[0])
ce_vec_seqs.append(autograd.Variable(
    torch.LongTensor(ce_vec)))  # input the final ce vec for the final tweet
del (ce_vec_seqs[0])  # delete the first empty element

pickle.dump(
    ce_vec_seqs,
    open("causal_explanation_da_labels_" + sys.argv[2] + ".list", "wb"))
def make_context_vector(context, word_to_idx):
    idx = [word_to_idx[w] for w in context]
    tensor = torch.LongTensor(idx)
    return autograd.Variable(tensor)
def make_target_vector(target, word_to_idx):
    idx = [word_to_idx[target]]
    tensor = torch.LongTensor(idx)
    return autograd.Variable(tensor)
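# An illustrative CBOW-style call for the two helpers above (toy vocabulary
# assumed): the context words become one index tensor, the center word a
# single-element target tensor.
word_to_idx = {"we": 0, "are": 1, "about": 2, "to": 3, "study": 4}
context_var = make_context_vector(["we", "are", "to", "study"], word_to_idx)  # 4 ids
target_var = make_target_vector("about", word_to_idx)                          # 1 id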
def init_hidden(self, batch_size):
    h0 = autograd.Variable(torch.zeros(1, batch_size, self.hidden_size)).cuda()
    c0 = autograd.Variable(torch.zeros(1, batch_size, self.hidden_size)).cuda()
    return h0, c0
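# A minimal, self-contained sketch of how an (h0, c0) pair like the one above
# is consumed (CPU variant; the .cuda() calls are dropped for illustration):
import torch
import torch.nn as nn
import torch.autograd as autograd

lstm = nn.LSTM(input_size=8, hidden_size=16)      # single layer, unidirectional
h0 = autograd.Variable(torch.zeros(1, 4, 16))     # (num_layers, batch, hidden_size)
c0 = autograd.Variable(torch.zeros(1, 4, 16))
inputs = autograd.Variable(torch.randn(5, 4, 8))  # (seq_len, batch, input_size)
output, (hn, cn) = lstm(inputs, (h0, c0))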
def main():
    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/network_model_pretrain.best.top.pair"
    print >> sys.stderr, "Read model from", best_network_file
    best_network_model = torch.load(best_network_file)

    embedding_matrix = numpy.load(embedding_file)
    # Building torch model
    network_model = network.Network(
        nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"],
        nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000,
        nnargs["embedding_size"], nnargs["embedding_dimention"],
        embedding_matrix).cuda()
    net_copy(network_model, best_network_model)

    best_network_file = "./model/network_model_pretrain.best.top.ana"
    print >> sys.stderr, "Read model from", best_network_file
    best_network_model = torch.load(best_network_file)
    ana_network = network.Network(
        nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"],
        nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000,
        nnargs["embedding_size"], nnargs["embedding_dimention"],
        embedding_matrix).cuda()
    net_copy(ana_network, best_network_model)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    train_docs_iter = DataReader.DataGnerater("train" + reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs_iter = DataReader.DataGnerater("dev" + reduced)
    test_docs_iter = DataReader.DataGnerater("test" + reduced)

    print "Performance after pretraining..."
    print "DEV"
    metric = performance.performance(dev_docs_iter, network_model, ana_network)
    print "Average:", metric["average"]
    print "TEST"
    metric = performance.performance(test_docs_iter, network_model, ana_network)
    print "Average:", metric["average"]
    print "***"
    print
    sys.stdout.flush()

    l2_lambda = 1e-6
    #lr = 0.00001
    #lr = 0.000005
    lr = 0.000002
    #lr = 0.0000009
    dropout_rate = 0.5
    shuffle = True
    times = 0

    reinforce = True

    model_save_dir = "./model/reinforce/"
    utils.mkdir(model_save_dir)

    score_softmax = nn.Softmax()

    optimizer = optim.RMSprop(network_model.parameters(), lr=lr, eps=1e-6)
    ana_optimizer = optim.RMSprop(ana_network.parameters(), lr=lr, eps=1e-6)

    scheduler = lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.5)
    ana_scheduler = lr_scheduler.StepLR(ana_optimizer, step_size=15, gamma=0.5)

    for echo in range(30):
        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo
        scheduler.step()
        ana_scheduler.step()

        train_docs = utils.load_pickle(args.DOCUMENT + 'train_docs.pkl')
        docs_by_id = {doc.did: doc for doc in train_docs}

        print >> sys.stderr, "Link docs ..."
        tmp_data = []
        path = []

        for data in train_docs_iter.rl_case_generater(shuffle=True):
            mention_word_index, mention_span, candi_word_index, candi_span, \
                feature_pair, pair_antecedents, pair_anaphors, target, \
                positive, negative, anaphoricity_word_indexs, \
                anaphoricity_spans, anaphoricity_features, \
                anaphoricity_target, rl, candi_ids_return = data

            mention_index = autograd.Variable(
                torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
            mention_spans = autograd.Variable(
                torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(
                torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(
                torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(
                torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(
                torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(
                torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
            anaphoricity_index = autograd.Variable(
                torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(
                torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(
                torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))

            output, pair_score = network_model.forward_all_pair(
                nnargs["word_embedding_dimention"], mention_index, mention_spans,
                candi_index, candi_spans, pair_feature, anaphors, antecedents, 0.0)
            ana_output, ana_score = ana_network.forward_anaphoricity(
                nnargs["word_embedding_dimention"], anaphoricity_index,
                anaphoricity_span, anaphoricity_feature, 0.0)
            ana_pair_output, ana_pair_score = ana_network.forward_all_pair(
                nnargs["word_embedding_dimention"], mention_index, mention_spans,
                candi_index, candi_spans, pair_feature, anaphors, antecedents, 0.0)

            reindex = autograd.Variable(
                torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))

            scores_reindex = torch.transpose(
                torch.cat((pair_score, ana_score), 1), 0, 1)[reindex]
            ana_scores_reindex = torch.transpose(
                torch.cat((ana_pair_score, ana_score), 1), 0, 1)[reindex]

            doc = docs_by_id[rl['did']]

            for s, e in zip(rl["starts"], rl["ends"]):
                score = score_softmax(
                    torch.transpose(ana_scores_reindex[s:e], 0, 1)).data.cpu().numpy()[0]
                pair_score = score_softmax(
                    torch.transpose(scores_reindex[s:e - 1], 0, 1)).data.cpu().numpy()[0]

                ana_action = utils.sample_action(score)
                if ana_action == (e - s - 1):
                    action = ana_action
                else:
                    pair_action = utils.sample_action(pair_score * score[:-1])
                    action = pair_action
                path.append(action)

                link = action
                m1, m2 = rl['ids'][s + link]
                doc.link(m1, m2)

            tmp_data.append(
                (mention_word_index, mention_span, candi_word_index, candi_span,
                 feature_pair, pair_antecedents, pair_anaphors, target, positive,
                 negative, anaphoricity_word_indexs, anaphoricity_spans,
                 anaphoricity_features, anaphoricity_target, rl, candi_ids_return))

            if rl["end"] == True:
                doc = docs_by_id[rl['did']]
                reward = doc.get_f1()
                inside_index = 0

                for mention_word_index, mention_span, candi_word_index, candi_span, \
                        feature_pair, pair_antecedents, pair_anaphors, target, \
                        positive, negative, anaphoricity_word_indexs, \
                        anaphoricity_spans, anaphoricity_features, \
                        anaphoricity_target, rl, candi_ids_return in tmp_data:

                    for (start, end) in zip(rl['starts'], rl['ends']):
                        ids = rl['ids'][start:end]
                        ana = ids[0, 1]
                        old_ant = doc.ana_to_ant[ana]
                        doc.unlink(ana)
                        costs = rl['costs'][start:end]
                        for ant_ind in range(end - start):
                            costs[ant_ind] = doc.link(
                                ids[ant_ind, 0], ana, hypothetical=True, beta=1)
                        doc.link(old_ant, ana)
                    cost = 0.0

                    mention_index = autograd.Variable(
                        torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
                    mention_spans = autograd.Variable(
                        torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
                    candi_index = autograd.Variable(
                        torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
                    candi_spans = autograd.Variable(
                        torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
                    pair_feature = autograd.Variable(
                        torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
                    anaphors = autograd.Variable(
                        torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
                    antecedents = autograd.Variable(
                        torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
                    anaphoricity_index = autograd.Variable(
                        torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
                    anaphoricity_span = autograd.Variable(
                        torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
                    anaphoricity_feature = autograd.Variable(
                        torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))

                    ana_output, ana_score = ana_network.forward_anaphoricity(
                        nnargs["word_embedding_dimention"], anaphoricity_index,
                        anaphoricity_span, anaphoricity_feature, dropout_rate)
                    ana_pair_output, ana_pair_score = ana_network.forward_all_pair(
                        nnargs["word_embedding_dimention"], mention_index,
                        mention_spans, candi_index, candi_spans, pair_feature,
                        anaphors, antecedents, dropout_rate)

                    reindex = autograd.Variable(
                        torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))
                    ana_scores_reindex = torch.transpose(
                        torch.cat((ana_pair_score, ana_score), 1), 0, 1)[reindex]

                    ana_optimizer.zero_grad()
                    ana_loss = None
                    i = inside_index
                    for s, e in zip(rl["starts"], rl["ends"]):
                        costs = rl["costs"][s:e]
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(torch.cuda.FloatTensor))
                        score = torch.squeeze(
                            score_softmax(torch.transpose(ana_scores_reindex[s:e], 0, 1)))
                        baseline = torch.sum(score * costs)
                        action = path[i]
                        this_cost = torch.log(score[action]) * -1.0 * (reward - baseline)
                        if ana_loss is None:
                            ana_loss = this_cost
                        else:
                            ana_loss += this_cost
                        i += 1
                    ana_loss.backward()
                    torch.nn.utils.clip_grad_norm(ana_network.parameters(), 5.0)
                    ana_optimizer.step()

                    mention_index = autograd.Variable(
                        torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
                    mention_spans = autograd.Variable(
                        torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
                    candi_index = autograd.Variable(
                        torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
                    candi_spans = autograd.Variable(
                        torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
                    pair_feature = autograd.Variable(
                        torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
                    anaphors = autograd.Variable(
                        torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
                    antecedents = autograd.Variable(
                        torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
                    anaphoricity_index = autograd.Variable(
                        torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
                    anaphoricity_span = autograd.Variable(
                        torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
                    anaphoricity_feature = autograd.Variable(
                        torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))

                    output, pair_score = network_model.forward_all_pair(
                        nnargs["word_embedding_dimention"], mention_index,
                        mention_spans, candi_index, candi_spans, pair_feature,
                        anaphors, antecedents, dropout_rate)
                    ana_output, ana_score = ana_network.forward_anaphoricity(
                        nnargs["word_embedding_dimention"], anaphoricity_index,
                        anaphoricity_span, anaphoricity_feature, dropout_rate)

                    reindex = autograd.Variable(
                        torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))
                    scores_reindex = torch.transpose(
                        torch.cat((pair_score, ana_score), 1), 0, 1)[reindex]

                    pair_loss = None
                    optimizer.zero_grad()
                    i = inside_index
                    index = 0
                    for s, e in zip(rl["starts"], rl["ends"]):
                        action = path[i]
                        if (not (action == (e - s - 1))) and (anaphoricity_target[index] == 1):
                            costs = rl["costs"][s:e - 1]
                            costs = autograd.Variable(
                                torch.from_numpy(costs).type(torch.cuda.FloatTensor))
                            score = torch.squeeze(
                                score_softmax(torch.transpose(scores_reindex[s:e - 1], 0, 1)))
                            baseline = torch.sum(score * costs)
                            this_cost = torch.log(score[action]) * -1.0 * (reward - baseline)
                            if pair_loss is None:
                                pair_loss = this_cost
                            else:
                                pair_loss += this_cost
                        i += 1
                        index += 1
                    if pair_loss is not None:
                        pair_loss.backward()
                        torch.nn.utils.clip_grad_norm(network_model.parameters(), 5.0)
                        optimizer.step()
                    inside_index = i

                tmp_data = []
                path = []

        end_time = timeit.default_timer()
        print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time - start_time)
        print >> sys.stderr, "cost:", cost
        print >> sys.stderr, "save model ..."
        torch.save(network_model, model_save_dir + "network_model_rl_worker.%d" % echo)
        torch.save(ana_network, model_save_dir + "network_model_rl_manager.%d" % echo)

        print "DEV"
        metric = performance.performance(dev_docs_iter, network_model, ana_network)
        print "Average:", metric["average"]
        print "DEV Ana: ", metric["ana"]
        print "TEST"
        metric = performance.performance(test_docs_iter, network_model, ana_network)
        print "Average:", metric["average"]
        print "TEST Ana: ", metric["ana"]
        print
        sys.stdout.flush()
        if i % (len(train_data) / Division) == 0:
            # evaluate
            eval_result = evaluate(model, dev_data, dictionaries)
            accuracys.append(eval_result['accuracy'])
            precisions.append(eval_result['precision'])
            recalls.append(eval_result['recall'])
            FB1s.append(eval_result['FB1'])
            save_model_dictionaries('model', model, dictionaries, opts)

        # Step 1. Remember that Pytorch accumulates gradients. We need to
        # clear them out before each instance.
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them
        # into Variables of word indices.
        input_words = autograd.Variable(torch.LongTensor(train_data[index]['words']))
        targets = autograd.Variable(torch.LongTensor(train_data[index]['tags']))

        # Step 3. Run our forward pass. We combine this step with the
        # get_loss function below.
        #tag_scores = model(sentence_in)

        # Step 4. Compute the loss and gradients, and update the parameters
        # by calling optimizer.step().
        loss = model.get_loss(targets, input_words=input_words)
        epoch_costs.append(loss.data.numpy())
        loss.backward()
        nn.utils.clip_grad_norm(model.parameters(), opts.clip)
        optimizer.step()

    print("Epoch %i, cost average: %f" % (epoch, np.mean(epoch_costs)))
def main():
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n
    num_rewards = len(task_rewards[mode])

    full_rollout = True

    env_model = EnvModel(envs.observation_space.shape, num_pixels, num_rewards)
    env_model.load_state_dict(torch.load("env_model_" + mode))

    distil_policy = ActorCritic(envs.observation_space.shape, envs.action_space.n)
    distil_optimizer = optim.Adam(distil_policy.parameters())

    imagination = ImaginationCore(1, state_shape, num_actions, num_rewards,
                                  env_model, distil_policy, full_rollout=full_rollout)

    actor_critic = I2A(state_shape, num_actions, num_rewards, 256, imagination,
                       full_rollout=full_rollout)

    # rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99
    optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    #if USE_CUDA:
    #    env_model = env_model.cuda()
    #    distil_policy = distil_policy.cuda()
    #    actor_critic = actor_critic.cuda()

    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e5)

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    #rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    current_state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(current_state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):
        for step in range(num_steps):
            #if USE_CUDA:
            #    current_state = current_state.cuda()
            action = actor_critic.act(autograd.Variable(current_state))

            next_state, reward, done, _ = envs.step(action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            #if USE_CUDA:
            #    masks = masks.cuda()

            current_state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, current_state, action.data, reward, masks)

        _, next_value = actor_critic(autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_logit, _, _, _ = distil_policy.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_loss = 0.01 * (F.softmax(logit).detach() * F.log_softmax(distil_logit)).sum(1).mean()

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)

        advantages = autograd.Variable(returns) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        distil_optimizer.zero_grad()
        distil_loss.backward()
        distil_optimizer.step()  # step the distillation optimizer, whose gradients were just computed

        if i_update % 100 == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            #clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' % (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "i2a_" + mode)
def feature_size(self):
    return self.features(
        autograd.Variable(torch.zeros(1, *self.in_shape))).view(1, -1).size(1)
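# A toy illustration of the feature_size trick above: pass a zero tensor of
# the input shape through the conv stack once to discover the flattened size
# (the layer configuration here is invented for the sketch):
import torch
import torch.nn as nn
import torch.autograd as autograd

features = nn.Sequential(nn.Conv2d(1, 4, kernel_size=3), nn.ReLU(), nn.MaxPool2d(2))
in_shape = (1, 28, 28)
n_flat = features(autograd.Variable(torch.zeros(1, *in_shape))).view(1, -1).size(1)
print(n_flat)  # 4 channels * 13 * 13 = 676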
import math, random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F

from IPython.display import clear_output
import matplotlib.pyplot as plt

USE_CUDA = torch.cuda.is_available()
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() \
    if USE_CUDA else autograd.Variable(*args, **kwargs)

from collections import deque


class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        # (return reconstructed; the snippet was truncated after the line above)
        return np.concatenate(state), action, reward, np.concatenate(next_state), done
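# A hypothetical smoke test for ReplayBuffer (the gym environment name is an
# assumption): push a handful of transitions, then sample a training batch.
buffer = ReplayBuffer(capacity=1000)
env = gym.make("CartPole-v0")
state = env.reset()
for _ in range(32):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    buffer.push(state, action, reward, next_state, done)
    state = env.reset() if done else next_state
states, actions, rewards, next_states, dones = buffer.sample(16)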
def wrap_var(self, x, **kwargs):
    x = A.Variable(x, **kwargs)
    return x
def train(model, dataloader, devloader, loss_function=nn.CrossEntropyLoss(),
          init_lr=0.1, epochs=100, lr_decay_epoch=30, print_epoch=10,
          gpu=False):
    # Cuda is not critical for this task with low dimensional inputs
    if gpu and torch.cuda.is_available():
        model.cuda()

    losses = []
    train_accs_top10 = []
    dev_losses = []
    dev_accs_top10 = []

    for epoch in range(epochs):
        # learning rate decay
        div, mod = divmod(epoch, lr_decay_epoch)
        if mod == 0:
            optimizer = optim.SGD(model.parameters(), lr=init_lr * (0.1)**div)

        total_loss = torch.Tensor([0])
        total_dev_loss = torch.Tensor([0])

        # iterate the dataset to load context heroes (team) and center hero (target)
        for teams, targets in dataloader:
            if gpu and torch.cuda.is_available():
                teams = teams.cuda()
                targets = targets.cuda()

            # wrap the embeddings of the team and target center hero in Variables
            inputs = autograd.Variable(teams)
            targets = autograd.Variable(targets.view(-1))

            # zero out the accumulated gradients
            model.zero_grad()

            # Run the forward pass
            out = model(inputs)

            # Compute your loss function.
            loss = loss_function(out, targets)

            # backpropagate and update the embeddings
            loss.backward()
            optimizer.step()

            # record total loss in this epoch
            total_loss += loss.cpu().data

        # acc_train_top10 = accuracy_in_train(model, dataloader, batch_size=16)
        # train_accs_top10.append(acc_train_top10)
        print("total_loss is %s" % total_loss)
        # print("total_train_acc is %s" % acc_train_top10)

        for teams, targets in devloader:
            if gpu and torch.cuda.is_available():
                teams = teams.cuda()
                targets = targets.cuda()

            # wrap the embeddings of the team and target center hero in Variables
            inputs = autograd.Variable(teams)
            targets = autograd.Variable(targets.view(-1))

            # Run the forward pass only; no backward step on the dev set
            out = model(inputs)

            # Compute your loss function.
            dev_loss = loss_function(out, targets)
            # print("dev_loss is %s" % dev_loss)

            # record total dev loss in this epoch
            total_dev_loss += dev_loss.cpu().data

        print("total_dev_loss is %s" % total_dev_loss)
        # acc_dev_top10 = accuracy_in_train(model, devloader, batch_size=16)
        # print("total_dev_acc is %s" % acc_dev_top10)
        # dev_accs_top10.append(acc_dev_top10)

        if epoch % print_epoch == 0:
            print('epoch: %d, loss: %.3f' % (epoch, total_loss / len(dataloader)))
            print("dev loss:%s" % str(total_dev_loss / len(devloader)))

        losses.append(total_loss / len(dataloader))
        dev_losses.append(total_dev_loss / len(devloader))

    # return losses for plot
    return np.array(losses), np.array(dev_losses), np.array(train_accs_top10), np.array(dev_accs_top10)
optimizer = optim.SGD(model.parameters(), lr=0.0008, weight_decay=1e-2)

n_train_samples = len(X_train)

# For n epochs...
for epoch in range(N_EPOCHS):
    total_loss = torch.Tensor([0])
    random_indices = np.random.permutation(n_train_samples)
    for index in random_indices:
        review = X_train[index]
        label = int(y_train[index])  # Why doesn't y.astype(int) work??

        # Initialize hidden layer
        hidden = autograd.Variable(torch.zeros((1, 128)))
        word_vector = autograd.Variable(torch.LongTensor(review))

        model.zero_grad()
        for w in range(word_vector.size()[0]):
            output, hidden = model(word_vector[w], hidden)

        loss = loss_function(output, autograd.Variable(torch.LongTensor([label])))
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), MAX_NORM)
        optimizer.step()
        total_loss += loss.data

    print(torch.norm(next(model.parameters()).grad))
    print("[epoch {}] {}".format(epoch, total_loss))
    #print(losses)

# The loss decreased every iteration over the training data!
def init_hidden(self, batch):
    # (num_layers * num_directions, batch_size, hidden_size)
    # (the second element of the pair was truncated in the snippet; completed
    # to match the identical helper earlier in this section)
    return (autograd.Variable(torch.zeros(self.hidden_layers, batch, self.hidden_dim)),
            autograd.Variable(torch.zeros(self.hidden_layers, batch, self.hidden_dim)))
def _viterbi_decode_nbest(self, feats, mask, nbest):
    """
    input:
        feats: (batch, seq_len, self.tag_size+2)
        mask: (batch, seq_len)
    output:
        decode_idx: (batch, nbest, seq_len) decoded sequences
        path_score: (batch, nbest) corresponding score for each sequence
            (to be implemented)
    nbest decoding for a sentence with one token is not well supported;
    to be optimized.
    """
    batch_size = feats.size(0)
    seq_len = feats.size(1)
    tag_size = feats.size(2)
    assert (tag_size == self.tagset_size + 2)
    ## calculate sentence length for each sentence
    length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long()
    ## mask to (seq_len, batch_size)
    mask = mask.transpose(1, 0).contiguous()
    ins_num = seq_len * batch_size
    ## be careful with the view shape: it is .view(ins_num, 1, tag_size),
    ## not .view(ins_num, tag_size, 1)
    feats = feats.transpose(1, 0).contiguous().view(
        ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size)
    ## need to consider start
    scores = feats + self.transitions.view(1, tag_size, tag_size).expand(
        ins_num, tag_size, tag_size)
    scores = scores.view(seq_len, batch_size, tag_size, tag_size)

    # build iter
    seq_iter = enumerate(scores)
    ## record the position of the best score
    back_points = list()
    partition_history = list()
    ## reverse mask (bug for mask = 1 - mask; use this as an alternative choice)
    # mask = 1 + (-1) * mask
    mask = (1 - mask.long()).bool()
    _, inivalues = next(seq_iter)  # bat_size * from_target_size * to_target_size
    # only need to start from start_tag
    partition = inivalues[:, START_TAG, :].clone()  # bat_size * to_target_size
    ## initial partition [batch_size, tag_size]
    partition_history.append(
        partition.view(batch_size, tag_size, 1).expand(batch_size, tag_size, nbest))
    # iterate over the remaining scores
    for idx, cur_values in seq_iter:
        if idx == 1:
            cur_values = cur_values.view(
                batch_size, tag_size, tag_size) + partition.contiguous().view(
                    batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size)
        else:
            # previous to_target is current from_target
            # partition: previous results log(exp(from_target)),
            # (batch_size * nbest * from_target)
            # cur_values: batch_size * from_target * to_target
            cur_values = cur_values.view(
                batch_size, tag_size, 1, tag_size).expand(
                    batch_size, tag_size, nbest, tag_size) + \
                partition.contiguous().view(
                    batch_size, tag_size, nbest, 1).expand(
                        batch_size, tag_size, nbest, tag_size)
            ## compare all nbest and all from_target
            cur_values = cur_values.view(batch_size, tag_size * nbest, tag_size)
            # print "cur size:", cur_values.size()
        partition, cur_bp = torch.topk(cur_values, nbest, 1)
        ## cur_bp/partition: [batch_size, nbest, tag_size]; ids should be
        ## normalized through nbest in the following backtrace step
        # print partition[:, 0, :]
        # print cur_bp[:, 0, :]
        # print "nbest, ", idx
        if idx == 1:
            cur_bp = cur_bp * nbest
        partition = partition.transpose(2, 1)
        cur_bp = cur_bp.transpose(2, 1)
        # partition: (batch_size * to_target * nbest)
        # cur_bp: (batch_size * to_target * nbest). Notice the cur_bp number
        # is the whole position in tag_size*nbest; it needs to be converted
        # when decoding.
        partition_history.append(partition)
        ## cur_bp: (batch_size, nbest, tag_size) top-n source score positions
        ## in the current tag
        ## set padded labels to 0, which will be filtered in post processing
        ## mask[idx] ? mask[idx-1]
        cur_bp.masked_fill_(
            mask[idx].view(batch_size, 1, 1).expand(batch_size, tag_size, nbest), 0)
        # print cur_bp[0]
        back_points.append(cur_bp)
    ### add score to final STOP_TAG
    partition_history = torch.cat(partition_history, 0).view(
        seq_len, batch_size, tag_size, nbest).transpose(1, 0).contiguous()
    ## (batch_size, seq_len, nbest, tag_size)
    ### get the last position for each sentence, and select the last
    ### partitions using gather()
    last_position = length_mask.view(batch_size, 1, 1, 1).expand(
        batch_size, 1, tag_size, nbest) - 1
    last_partition = torch.gather(partition_history, 1, last_position).view(
        batch_size, tag_size, nbest, 1)
    ### calculate the score from the last partition to the end state (and then
    ### select the STOP_TAG from it)
    last_values = last_partition.expand(
        batch_size, tag_size, nbest, tag_size) + self.transitions.view(
            1, tag_size, 1, tag_size).expand(batch_size, tag_size, nbest, tag_size)
    last_values = last_values.view(batch_size, tag_size * nbest, tag_size)
    end_partition, end_bp = torch.topk(last_values, nbest, 1)
    ## end_partition: (batch, nbest, tag_size)
    end_bp = end_bp.transpose(2, 1)
    # end_bp: (batch, tag_size, nbest)
    pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size, nbest)).long()
    if self.gpu:
        pad_zero = pad_zero.cuda()
    back_points.append(pad_zero)
    back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size, nbest)

    ## select end ids in STOP_TAG
    pointer = end_bp[:, STOP_TAG, :]  ## (batch_size, nbest)
    insert_last = pointer.contiguous().view(
        batch_size, 1, 1, nbest).expand(batch_size, 1, tag_size, nbest)
    back_points = back_points.transpose(1, 0).contiguous()
    ## move the end ids (expanded to tag_size) to the corresponding positions
    ## of back_points to replace the 0 values
    # print "lp:", last_position
    # print "il:", insert_last[0]
    ## copy the ids of the last position (insert_last) to back_points through
    ## the last_position index; last_position encodes the length of each
    ## sentence in the batch
    # print "old:", back_points[9, 0, :, :]
    back_points.scatter_(1, last_position, insert_last)
    ## back_points: [batch_size, seq_length, tag_size, nbest]
    # print "new:", back_points[9, 0, :, :]
    '''
    back_points: simple demonstration
    x,x,x,x,x,x,x,x,x,7
    x,x,x,x,x,4,0,0,0,0
    x,x,6,0,0,0,0,0,0,0
    '''
    back_points = back_points.transpose(1, 0).contiguous()
    ## back_points: (seq_len, batch, tag_size, nbest)
    ## decode from the end; padded position ids are 0, which will be filtered
    ## in the following evaluation
    decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size, nbest))
    if self.gpu:
        decode_idx = decode_idx.cuda()
    decode_idx[-1] = pointer.data / nbest
    # use the old mask; 0 means the position has a token
    for idx in range(len(back_points) - 2, -1, -1):
        # print "pointer: ", idx, pointer[3]
        # print "back:", back_points[idx][3]
        # print "mask:", mask[idx + 1, 3]
        new_pointer = torch.gather(
            back_points[idx].view(batch_size, tag_size * nbest), 1,
            pointer.contiguous().view(batch_size, nbest))
        decode_idx[idx] = new_pointer.data / nbest
        # use the new pointer to remember the last end nbest ids for the
        # non-longest sentences
        pointer = new_pointer + pointer.contiguous().view(
            batch_size, nbest) * mask[idx].view(batch_size, 1).expand(
                batch_size, nbest).long()
    path_score = None
    decode_idx = decode_idx.transpose(1, 0)
    ## decode_idx: [batch, seq_len, nbest]

    ### calculate the probability for each sequence
    scores = end_partition[:, :, STOP_TAG]
    ## scores: [batch_size, nbest]
    max_scores, _ = torch.max(scores, 1)
    minus_scores = scores - max_scores.view(batch_size, 1).expand(batch_size, nbest)
    path_score = F.softmax(minus_scores, 1)
    ## path_score: [batch_size, nbest]
    return path_score, decode_idx
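# A tiny numeric sketch of the n-best recursion above: at each step the
# (tag_size * nbest) candidate scores per destination tag are flattened and
# the top k kept; each returned index encodes previous_tag * nbest + beam_slot.
import torch

tag_size, nbest = 4, 2
cand = torch.randn(1, tag_size * nbest, tag_size)  # (batch, tag_size*nbest, to_tag)
partition, cur_bp = torch.topk(cand, nbest, 1)     # both (batch, nbest, to_tag)
prev_tag = cur_bp // nbest                         # which previous tag
beam_slot = cur_bp % nbest                         # which of its n best paths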
# Author: Zhi Zhong
import sys

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

## LSTM cell (input->3, output->3)
lstm = nn.LSTM(3, 3)
inputs = [autograd.Variable(torch.randn((1, 3))) for _ in range(5)]
print(inputs)

hidden = (autograd.Variable(torch.randn((1, 1, 3))),
          autograd.Variable(torch.randn((1, 1, 3))))
print(hidden)

for i in inputs:
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    print(out)
    print(hidden)
def init_hidden(self):
    # the first is the hidden h
    # the second is the cell c
    return (autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim).cuda()),
            autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim).cuda()))
def _viterbi_decode(self, feats, mask):
    """
    input:
        feats: (batch, seq_len, self.tag_size+2)
        mask: (batch, seq_len)
    output:
        decode_idx: (batch, seq_len) decoded sequence
        path_score: (batch, 1) corresponding score for each sequence
            (to be implemented)
    """
    batch_size = feats.size(0)
    seq_len = feats.size(1)
    tag_size = feats.size(2)
    assert (tag_size == self.tagset_size + 2)
    ## calculate sentence length for each sentence
    length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long()
    ## mask to (seq_len, batch_size)
    mask = mask.transpose(1, 0).contiguous()
    ins_num = seq_len * batch_size
    ## be careful with the view shape: it is .view(ins_num, 1, tag_size),
    ## not .view(ins_num, tag_size, 1)
    feats = feats.transpose(1, 0).contiguous().view(
        ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size)
    ## need to consider start
    scores = feats + self.transitions.view(1, tag_size, tag_size).expand(
        ins_num, tag_size, tag_size)
    scores = scores.view(seq_len, batch_size, tag_size, tag_size)

    # build iter
    seq_iter = enumerate(scores)
    ## record the position of the best score
    back_points = list()
    partition_history = list()
    ## reverse mask (bug for mask = 1 - mask; use this as an alternative choice)
    # mask = 1 + (-1) * mask
    mask = (1 - mask.long()).byte()
    _, inivalues = seq_iter.__next__()  # bat_size * from_target_size * to_target_size
    # only need to start from start_tag
    partition = inivalues[:, START_TAG, :].clone().view(
        batch_size, tag_size)  # bat_size * to_target_size
    partition_history.append(partition)
    # iterate over the remaining scores
    for idx, cur_values in seq_iter:
        # previous to_target is current from_target
        # partition: previous results log(exp(from_target)),
        # (batch_size * from_target)
        # cur_values: batch_size * from_target * to_target
        cur_values = cur_values + partition.contiguous().view(
            batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size)
        ## forscores, cur_bp = torch.max(cur_values[:, :-2, :], 1)
        ## do not consider START_TAG/STOP_TAG
        partition, cur_bp = torch.max(cur_values, 1)
        partition_history.append(partition)
        ## cur_bp: (batch_size, tag_size) max source score position in current tag
        ## set padded labels to 0, which will be filtered in post processing
        cur_bp.masked_fill_(
            mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)
        back_points.append(cur_bp)
    ### add score to final STOP_TAG
    partition_history = torch.cat(partition_history, 0).view(
        seq_len, batch_size, -1).transpose(1, 0).contiguous()
    ## (batch_size, seq_len, tag_size)
    ### get the last position for each sentence, and select the last
    ### partitions using gather()
    last_position = length_mask.view(batch_size, 1, 1).expand(
        batch_size, 1, tag_size) - 1
    last_partition = torch.gather(partition_history, 1, last_position).view(
        batch_size, tag_size, 1)
    ### calculate the score from the last partition to the end state (and then
    ### select the STOP_TAG from it)
    last_values = last_partition.expand(
        batch_size, tag_size, tag_size) + self.transitions.view(
            1, tag_size, tag_size).expand(batch_size, tag_size, tag_size)
    _, last_bp = torch.max(last_values, 1)
    pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long()
    if self.gpu:
        pad_zero = pad_zero.cuda()
    back_points.append(pad_zero)
    back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size)

    ## select end ids in STOP_TAG
    pointer = last_bp[:, STOP_TAG]
    insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(
        batch_size, 1, tag_size)
    back_points = back_points.transpose(1, 0).contiguous()
    ## move the end ids (expanded to tag_size) to the corresponding positions
    ## of back_points to replace the 0 values
    # print "lp:", last_position
    # print "il:", insert_last
    back_points.scatter_(1, last_position, insert_last)
    # print "bp:", back_points
    back_points = back_points.transpose(1, 0).contiguous()
    ## decode from the end; padded position ids are 0, which will be filtered
    ## in the following evaluation
    decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size))
    if self.gpu:
        decode_idx = decode_idx.cuda()
    decode_idx[-1] = pointer.data
    for idx in range(len(back_points) - 2, -1, -1):
        pointer = torch.gather(back_points[idx], 1,
                               pointer.contiguous().view(batch_size, 1))
        decode_idx[idx] = pointer.data
    path_score = None
    decode_idx = decode_idx.transpose(1, 0)
    return path_score, decode_idx
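# A small sketch of the gather-based backtrace used above: back_points[idx]
# stores, for every current tag, the best previous tag, so repeatedly
# gathering with the running pointer walks the Viterbi path backwards.
import torch

batch_size, tag_size = 2, 5
back_step = torch.LongTensor(batch_size, tag_size).random_(0, tag_size)
pointer = torch.LongTensor([[3], [1]])              # current best tag per sentence
prev_pointer = torch.gather(back_step, 1, pointer)  # best tag one step earlier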
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    idxs = torch.LongTensor(idxs)
    #print(len(idxs))
    #idxs = idxs.view(batch_size, len(idxs) / batch_size)
    return autograd.Variable(idxs)
def main():
    DIR = args.DIR
    embedding_file = args.embedding_dir

    #network_file = "./model/model.pkl"
    network_file = "./model/pretrain/network_model_pretrain.49"
    if os.path.isfile(network_file):
        print >> sys.stderr, "Read model from", network_file
        network_model = torch.load(network_file)
    else:
        embedding_matrix = numpy.load(embedding_file)
        #print len(embedding_matrix)
        # Building torch model
        network_model = network.Network(
            pair_feature_dimention, mention_feature_dimention,
            word_embedding_dimention, span_dimention, 1000, embedding_size,
            embedding_dimention, embedding_matrix).cuda()
        print >> sys.stderr, "save model ..."
        torch.save(network_model, network_file)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    train_docs = DataReader.DataGnerater("train" + reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs = DataReader.DataGnerater("dev" + reduced)
    test_docs = DataReader.DataGnerater("test" + reduced)

    l2_lambda = 1e-6
    lr = 0.00009
    dropout_rate = 0.5
    shuffle = True
    times = 0
    best_thres = 0.5

    model_save_dir = "./model/pretrain/"

    last_cost = 0.0
    all_best_results = {
        'thresh': 0.0,
        'accuracy': 0.0,
        'precision': 0.0,
        'recall': 0.0,
        'f1': 0.0
    }

    #for echo in range(30, 200):
    for echo in range(50, 150):
        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo

        if echo == 100:
            lr = lr * 0.7
        #if echo == 150:
        #    lr = lr / 2.0

        #optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, network_model.parameters()), lr=lr, weight_decay=l2_lambda)
        #optimizer = optim.RMSprop(network_model.parameters(), lr=lr, weight_decay=l2_lambda)
        optimizer = optim.RMSprop(network_model.parameters(), lr=lr, eps=1e-5,
                                  weight_decay=l2_lambda)

        pair_cost_this_turn = 0.0
        ana_cost_this_turn = 0.0

        pair_nums = 0
        ana_nums = 0
        pos_num = 0
        neg_num = 0
        inside_time = 0.0

        for data in train_docs.train_generater(shuffle=shuffle):
            mention_word_index, mention_span, candi_word_index, candi_span, \
                feature_pair, pair_antecedents, pair_anaphors, target, \
                positive, negative, anaphoricity_word_indexs, \
                anaphoricity_spans, anaphoricity_features, \
                anaphoricity_target = data

            mention_index = autograd.Variable(
                torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
            mention_span = autograd.Variable(
                torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(
                torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(
                torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(
                torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(
                torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(
                torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
            anaphoricity_index = autograd.Variable(
                torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(
                torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(
                torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))

            gold = target.tolist()
            anaphoricity_gold = anaphoricity_target.tolist()
            pair_nums += len(gold)
            ana_nums += len(anaphoricity_gold)
            lable = autograd.Variable(torch.cuda.FloatTensor([gold]))
            ana_lable = autograd.Variable(torch.cuda.FloatTensor([anaphoricity_gold]))

            output, _ = network_model.forward_all_pair(
                word_embedding_dimention, mention_index, mention_span,
                candi_index, candi_spans, pair_feature, anaphors, antecedents,
                dropout_rate)
            ana_output, _ = network_model.forward_anaphoricity(
                word_embedding_dimention, anaphoricity_index,
                anaphoricity_span, anaphoricity_feature, dropout_rate)

            optimizer.zero_grad()
            #loss = get_pair_loss(output, positive, negative, train_docs.scale_factor)
            loss = F.binary_cross_entropy(
                output, lable, size_average=False) / train_docs.scale_factor
            ana_loss = F.binary_cross_entropy(
                ana_output, ana_lable,
                size_average=False) / train_docs.anaphoricity_scale_factor

            pair_cost_this_turn += loss.data[0] * train_docs.scale_factor
            ana_cost_this_turn += ana_loss.data[0] * train_docs.anaphoricity_scale_factor

            loss_all = loss + ana_loss
            loss_all.backward()
            optimizer.step()

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain epoch", echo, \
            "Pair total cost:", pair_cost_this_turn / float(pair_nums), \
            "Anaphoricity total cost", ana_cost_this_turn / float(ana_nums)
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time - start_time)
        print >> sys.stderr, "Learning Rate", lr

        print >> sys.stderr, "save model ..."
        torch.save(network_model, model_save_dir + "network_model_pretrain.%d" % echo)

        #if cost_this_turn > last_cost:
        #    lr = lr * 0.7

        gold = []
        predict = []
        ana_gold = []
        ana_predict = []

        for data in dev_docs.train_generater(shuffle=False):
            mention_word_index, mention_span, candi_word_index, candi_span, \
                feature_pair, pair_antecedents, pair_anaphors, target, \
                positive, negative, anaphoricity_word_indexs, \
                anaphoricity_spans, anaphoricity_features, \
                anaphoricity_target = data

            mention_index = autograd.Variable(
                torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
            mention_span = autograd.Variable(
                torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(
                torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(
                torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(
                torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(
                torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(
                torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
            anaphoricity_index = autograd.Variable(
                torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(
                torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(
                torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))

            gold += target.tolist()
            ana_gold += anaphoricity_target.tolist()

            output, _ = network_model.forward_all_pair(
                word_embedding_dimention, mention_index, mention_span,
                candi_index, candi_spans, pair_feature, anaphors, antecedents,
                0.0)
            predict += output.data.cpu().numpy()[0].tolist()

            ana_output, _ = network_model.forward_anaphoricity(
                word_embedding_dimention, anaphoricity_index,
                anaphoricity_span, anaphoricity_feature, 0.0)
            ana_predict += ana_output.data.cpu().numpy()[0].tolist()

        gold = numpy.array(gold, dtype=numpy.int32)
        predict = numpy.array(predict)

        best_results = {
            'thresh': 0.0,
            'accuracy': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0
        }
        thresh_list = [0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
        for thresh in thresh_list:
            evaluation_results = get_metrics(gold, predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results

        print "Pair accuracy: %f and Fscore: %f with thresh: %f" \
            % (best_results["accuracy"], best_results["f1"], best_results["thresh"])
        sys.stdout.flush()

        if best_results["f1"] > all_best_results["f1"]:
            all_best_results = best_results
            print >> sys.stderr, "New High Result, Save Model"
            torch.save(network_model, model_save_dir + "network_model_pretrain.best")

        ana_gold = numpy.array(ana_gold, dtype=numpy.int32)
        ana_predict = numpy.array(ana_predict)
        best_results = {
            'thresh': 0.0,
            'accuracy': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0
        }
        for thresh in thresh_list:
            evaluation_results = get_metrics(ana_gold, ana_predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results
        print "Anaphoricity accuracy: %f and Fscore: %f with thresh: %f" \
            % (best_results["accuracy"], best_results["f1"], best_results["thresh"])
        sys.stdout.flush()

        if (echo + 1) % 10 == 0:
            best_network_model = torch.load(model_save_dir + "network_model_pretrain.best")
            print "DEV:"
            performance.performance(dev_docs, best_network_model)
            print "TEST:"
            performance.performance(test_docs, best_network_model)

    ## output best
    print "In sum, anaphoricity accuracy: %f and Fscore: %f with thresh: %f" \
        % (best_results["accuracy"], best_results["f1"], best_results["thresh"])
    sys.stdout.flush()
def forward(self, data):
    slow_feats = Variable(torch.cuda.FloatTensor(data[1]))
    moderate_feats = Variable(torch.cuda.FloatTensor(data[2]))
    fast_feats = Variable(torch.cuda.FloatTensor(data[3]))
    #slow_feats = Variable(torch.FloatTensor(data[1]))
    #moderate_feats = Variable(torch.FloatTensor(data[2]))
    #fast_feats = Variable(torch.FloatTensor(data[3]))

    slow_feats = slow_feats.unsqueeze(0)  # this adds a batch-size dimension
    moderate_feats = moderate_feats.unsqueeze(0)
    fast_feats = fast_feats.unsqueeze(0)

    # Forward passes
    #print(slow_feats.shape, moderate_feats.shape, fast_feats.shape)
    pad_attn_slow = self._forward(slow_feats, 'slow')
    pad_attn_moderate = self._forward(moderate_feats, 'moderate')
    pad_attn_fast = self._forward(fast_feats, 'fast')

    if self.lstm_output_type == 'same':
        if self.use_second_attention:
            new_tensor = torch.cuda.FloatTensor(1, 3, self.lstm_hidden_dim)
            #first_attns = torch.cat(1, (pad_attn_slow, pad_attn_moderate, pad_attn_fast))  # concat to be 1 x 3 x hidden_dim
            new_tensor[:, 0, :] = pad_attn_slow
            new_tensor[:, 1, :] = pad_attn_moderate
            new_tensor[:, 2, :] = pad_attn_fast
            pad_attn = self.final_attn(
                (new_tensor, autograd.Variable(torch.cuda.LongTensor([3]))))
            # length of 3 always, because fast/slow/moderate
        else:
            # Concatenate slow, moderate and fast
            pad_attn = torch.cat(
                (pad_attn_slow, pad_attn_moderate, pad_attn_fast),
                1)  # concat to be 1 x 3*hidden_dim
    elif self.lstm_output_type == 'different':
        if self.use_second_attention:
            # pad all with zeros
            new_tensor = torch.cuda.FloatTensor(1, 3, self.hidden_dim_slow)
            padded_moderate = F.pad(
                pad_attn_moderate,
                pad=(0, self.hidden_dim_slow - self.hidden_dim_moderate))
            padded_fast = F.pad(
                pad_attn_fast,
                pad=(0, self.hidden_dim_slow - self.hidden_dim_fast))
            new_tensor[:, 0, :] = pad_attn_slow
            new_tensor[:, 1, :] = padded_moderate
            new_tensor[:, 2, :] = padded_fast
            pad_attn = self.final_attn(
                (new_tensor, autograd.Variable(torch.cuda.LongTensor([3]))))
            # length of 3 always, because fast/slow/moderate
        else:
            # Concatenate slow, moderate and fast
            pad_attn = torch.cat(
                (pad_attn_slow, pad_attn_moderate, pad_attn_fast),
                1)  # concat to be 1 x 3*hidden_dim

    # Pass through FC layer and Softmax
    tag_space = self.hidden2tag(pad_attn)
    tag_score = F.log_softmax(tag_space, dim=1)

    # Return predictions
    return tag_score
def init_hidden(self, batch_size):
    # h, c shape: [num_layers * num_directions, batch, hidden_size]
    return (autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)),
            autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)))
    if teacher_forcing or True:
        enc_loss += criterion(
            pred_c, autograd.Variable(torch.LongTensor([input_c_encoded])))
    prev_c = input_c_encoded
    # prev_c_encoded = autograd.Variable(
    #     torch.from_numpy(np.array([input_c_encoded], np.int32)).long().view(1, 1)
    # )
    if n <= 4 and epoch % print_every == 0:
        if n == 0:
            encoder_debug += 'epoch %s encoder:\n' % epoch
        encoder_debug += ' [%s] => [%s]\n' % (input_sentence_verify, sentence)
    return state, enc_loss

state = autograd.Variable(torch.zeros(1, 1, hidden_size))
state, enc_loss = encode(input_encoded, state)
loss += enc_loss

# decode
if False:
    prev_c_encoded = autograd.Variable(
        torch.from_numpy(np.array([encoding.start_code], np.int32)).long().view(1, 1))

output_sentence = ''
for t, target_c_encoded in enumerate(target_encoded[1:]):
    # this is going to correspond approximately to 'teacher forcing'
    # in the seq2seq example on the pytorch website
    prev_c_embedded = embedding(prev_c_encoded)
def make_var(np_array, requires_grad=False):
    tensor = torch.from_numpy(np_array.astype(np.float32))
    return autograd.Variable(tensor, requires_grad=requires_grad)
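# An illustrative call (array contents invented): make_var casts float64 NumPy
# input to float32, matching PyTorch's default parameter dtype.
x = make_var(np.array([[1.0, 2.0], [3.0, 4.0]]))
y = make_var(np.zeros((2, 2)), requires_grad=True)  # leaf Variable that tracks gradients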
def batchify_with_label(data, input_batch_list, input_batch_list_text, gpu):
    with torch.no_grad():  # feili, compatible with 0.4
        batch_size = len(input_batch_list)
        words = [sent[0] for sent in input_batch_list]
        if input_batch_list_text is None:
            chars = [sent[1] for sent in input_batch_list]
        if data.feat_config is not None:
            if len(input_batch_list[0]) > 3:
                labels = [sent[2] for sent in input_batch_list]
                features = [np.asarray(sent[3]) for sent in input_batch_list]
                feature_num = len(features[0][0])
            else:
                labels = None
                features = [np.asarray(sent[2]) for sent in input_batch_list]
                feature_num = len(features[0][0])
        else:
            if len(input_batch_list[0]) > 2:
                labels = [sent[2] for sent in input_batch_list]
            else:
                labels = None

        word_seq_lengths = torch.LongTensor(list(map(len, words)))

        if input_batch_list_text is not None:
            if labels:
                words_text = [sent[3] for sent in input_batch_list_text]
            else:
                words_text = [sent[2] for sent in input_batch_list_text]

        max_seq_len = word_seq_lengths.max().item()
        word_seq_tensor = autograd.Variable(
            torch.zeros((batch_size, max_seq_len), dtype=torch.long))
        label_seq_tensor = autograd.Variable(
            torch.zeros((batch_size, max_seq_len), dtype=torch.long))
        if data.feat_config is not None:
            feature_seq_tensors = []
            for idx in range(feature_num):
                feature_seq_tensors.append(
                    autograd.Variable(
                        torch.zeros((batch_size, max_seq_len), dtype=torch.long)))
        if input_batch_list_text is not None:
            words_text_tensor = [['<pad>' for col in range(max_seq_len)]
                                 for row in range(batch_size)]
        mask = autograd.Variable(
            torch.zeros((batch_size, max_seq_len), dtype=torch.uint8))

        if labels:
            for idx, (seq, label, seqlen) in enumerate(
                    zip(words, labels, word_seq_lengths)):
                word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
                label_seq_tensor[idx, :seqlen] = torch.LongTensor(label)
                mask[idx, :seqlen] = torch.Tensor([1] * seqlen.item())
                if data.feat_config is not None:
                    for idy in range(feature_num):
                        feature_seq_tensors[idy][idx, :seqlen] = torch.LongTensor(
                            features[idx][:, idy])
                if input_batch_list_text is not None:
                    words_text_tensor[idx][:seqlen] = words_text[idx]
        else:
            for idx, (seq, seqlen) in enumerate(zip(words, word_seq_lengths)):
                word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
                mask[idx, :seqlen] = torch.Tensor([1] * seqlen.item())
                if data.feat_config is not None:
                    for idy in range(feature_num):
                        feature_seq_tensors[idy][idx, :seqlen] = torch.LongTensor(
                            features[idx][:, idy])
                if input_batch_list_text is not None:
                    words_text_tensor[idx][:seqlen] = words_text[idx]

        word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
        word_seq_tensor = word_seq_tensor[word_perm_idx]
        if data.feat_config is not None:
            for idx in range(feature_num):
                feature_seq_tensors[idx] = feature_seq_tensors[idx][word_perm_idx]
        if labels:
            label_seq_tensor = label_seq_tensor[word_perm_idx]
        mask = mask[word_perm_idx]

        if input_batch_list_text is not None:
            words_text_tensor_1 = []
            for i in range(batch_size):
                ii = word_perm_idx[i].item()
                words_text_tensor_1.append(words_text_tensor[ii])
            char_seq_tensor = None
            char_seq_lengths = None
            char_seq_recover = None
        else:
            words_text_tensor_1 = None
            ### deal with chars
            # pad_chars (batch_size, max_seq_len)
            pad_chars = [
                chars[idx] + [[0]] * (max_seq_len - len(chars[idx]))
                for idx in range(len(chars))
            ]
            length_list = [list(map(len, pad_char)) for pad_char in pad_chars]
            max_word_len = max(list(map(max, length_list)))
            char_seq_tensor = autograd.Variable(
                torch.zeros((batch_size, max_seq_len, max_word_len),
                            dtype=torch.long))
            char_seq_lengths = torch.LongTensor(length_list)
            for idx, (seq, seqlen) in enumerate(zip(pad_chars, char_seq_lengths)):
                for idy, (word, wordlen) in enumerate(zip(seq, seqlen)):
                    # print len(word), wordlen
                    char_seq_tensor[idx, idy, :wordlen] = torch.LongTensor(word)
            char_seq_tensor = char_seq_tensor[word_perm_idx].view(
                batch_size * max_seq_len, -1)
            char_seq_lengths = char_seq_lengths[word_perm_idx].view(
                batch_size * max_seq_len, )
            char_seq_lengths, char_perm_idx = char_seq_lengths.sort(
                0, descending=True)
            char_seq_tensor = char_seq_tensor[char_perm_idx]
            _, char_seq_recover = char_perm_idx.sort(0, descending=False)

        _, word_seq_recover = word_perm_idx.sort(0, descending=False)

        if opt.gpu >= 0 and torch.cuda.is_available():
            word_seq_tensor = word_seq_tensor.cuda(gpu)
            word_seq_lengths = word_seq_lengths.cuda(gpu)
            word_seq_recover = word_seq_recover.cuda(gpu)
            if labels:
                label_seq_tensor = label_seq_tensor.cuda(gpu)
            if data.feat_config is not None:
                for idx in range(feature_num):
                    feature_seq_tensors[idx] = feature_seq_tensors[idx].cuda(gpu)
            if input_batch_list_text is None:
                char_seq_tensor = char_seq_tensor.cuda(gpu)
                char_seq_recover = char_seq_recover.cuda(gpu)
            mask = mask.cuda(gpu)

        if labels:
            if data.feat_config is not None:
                return word_seq_tensor, word_seq_lengths, word_seq_recover, \
                    char_seq_tensor, char_seq_lengths, char_seq_recover, \
                    label_seq_tensor, mask, feature_seq_tensors, words_text_tensor_1
            else:
                return word_seq_tensor, word_seq_lengths, word_seq_recover, \
                    char_seq_tensor, char_seq_lengths, char_seq_recover, \
                    label_seq_tensor, mask, None, words_text_tensor_1
        else:
            if data.feat_config is not None:
                return word_seq_tensor, word_seq_lengths, word_seq_recover, \
                    char_seq_tensor, char_seq_lengths, char_seq_recover, \
                    None, mask, feature_seq_tensors, words_text_tensor_1
            else:
                return word_seq_tensor, word_seq_lengths, word_seq_recover, \
                    char_seq_tensor, char_seq_lengths, char_seq_recover, \
                    None, mask, None, words_text_tensor_1