def action(self, scene_logprob, sample=True):
    # action prob
    type_action_prob = self.type_planner.action_prob(
        scene_logprob.type_logprob)
    size_action_prob = self.size_planner.action_prob(
        scene_logprob.size_logprob)
    # get action
    type_action, type_action_logprob = sample_action(type_action_prob,
                                                     sample=sample)
    size_action, size_action_logprob = sample_action(size_action_prob,
                                                     sample=sample)
    action = (None, None, None, type_action, size_action, None)
    action_logprob = (None, None, None, type_action_logprob,
                      size_action_logprob, None)
    return action, action_logprob, (None, type_action_prob,
                                    size_action_prob, None)
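# The planner methods above (and the fuller variant later in this file) call
# sample_action(prob, sample=...) and expect back both the chosen index and
# its log-probability. The helper below is a minimal sketch consistent with
# those call sites, not the original implementation: it draws from a
# Categorical when sample=True and falls back to the argmax otherwise.
import torch
from torch.distributions import Categorical


def sample_action(prob, sample=True):
    """Return (action, action_logprob) for a batch of probability rows."""
    if sample:
        dist = Categorical(probs=prob)
        action = dist.sample()
        return action, dist.log_prob(action)
    # Greedy fallback: pick the most probable action deterministically.
    action = prob.argmax(dim=-1)
    logprob = torch.log(prob.gather(-1, action.unsqueeze(-1))).squeeze(-1)
    return action, logprob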
def epsilon_greedy_action(q_sa, s, epsilon):
    a_best = greedy_action(q_sa, s)
    selection_probs = []
    for a in list(Action):
        if a is a_best:
            selection_probs.append(1 - epsilon + epsilon / len(Action))
        else:
            selection_probs.append(epsilon / len(Action))
    return sample_action(selection_probs)
def rollout(self, net):
    """
    rollout handles the actual rollout of the environment for n steps in
    time.

    net - torch Module object. This is the model to interact with the
        environment.
    """
    net.eval()
    state = next_state(self.env,
                       self.obs_deque,
                       obs=None,
                       reset=True,
                       preprocess=self.hyps['preprocess'])
    ep_rew = 0
    hyps = self.hyps
    is_recurrent = hasattr(net, "fresh_h")
    if not is_recurrent:
        h = None
    else:
        h = net.fresh_h()
    t = 0
    episode_count = 1
    while t <= 400:
        t += 1
        state = cuda_if(torch.FloatTensor(state))
        if is_recurrent:
            val, logits, h = net(state[None], h=cuda_if(h.detach().data))
        else:
            val, logits = net(state[None])
        if self.hyps['discrete_env']:
            probs = F.softmax(logits, dim=-1)
            action = sample_action(probs.data)
            action = int(action.item())
        else:
            mu, sig = logits
            action = mu + torch.randn_like(sig) * sig
            action = action.cpu().detach().numpy().squeeze()
            if len(action.shape) == 0:
                action = np.asarray([float(action)])
        obs, rew, done, info = self.env.step(action + hyps['action_shift'])
        if hyps['render']:
            self.env.render()
        ep_rew += rew
        reset = done
        if "Pong" in hyps['env_type'] and rew != 0:
            done = True
        if done:
            episode_count += 1
        state = next_state(self.env,
                           self.obs_deque,
                           obs=obs,
                           reset=reset,
                           preprocess=hyps['preprocess'])
    return ep_rew / episode_count, ep_rew / t
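# The rollout functions in this collection call sample_action(probs.data) on
# a softmax output and then take .item() on the result, so the helper must
# return a tensor holding a single sampled index. A minimal sketch consistent
# with that call site (an assumption, not the original utility):
import torch


def sample_action(probs):
    """Sample one action index from a [1, n_actions] probability tensor."""
    return torch.multinomial(probs, num_samples=1)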
def e_greedy(s, w, epsilon=0.05):
    a_best = greedy(s, w)
    selection_probs = []
    default_p = epsilon / len(Action)
    for a in list(Action):
        if a is a_best:
            selection_probs.append(1 - epsilon + default_p)
        else:
            selection_probs.append(default_p)
    return sample_action(selection_probs)
def mc_control(num_episodes=10000):
    q_sa = {}
    p = {}
    n_s = {}
    n_sa = {}
    n0 = 100
    for _ in range(num_episodes):
        state = State()
        reward = 0
        episode_s = []
        episode_sa = []
        while not state.terminal:
            s = state.as_tuple()
            if s in p:
                a = sample_action(p[s])
            else:
                a = Action.random()
            episode_s.append(s)
            episode_sa.append(s + (a, ))
            state, reward = step(state, a)
            ns = n_s.get(s, 0)
            n_s[s] = ns + 1
            sa = s + (a, )
            nsa = n_sa.get(sa, 0)
            n_sa[sa] = nsa + 1
        # GLIE MC Control
        for sa in set(episode_sa):
            nsa = n_sa[sa]
            qsa = q_sa.get(sa, 0)
            q_sa[sa] = qsa + ((reward - qsa) / nsa)
        # Improve policy
        for s in set(episode_s):
            a_best = greedy_action(q_sa, s)
            ns = n_s.get(s, 0)
            epsilon = n0 / (n0 + ns)
            selection_probs = []
            for a in list(Action):
                if a is a_best:
                    selection_probs.append(1 - epsilon + epsilon / len(Action))
                else:
                    selection_probs.append(epsilon / len(Action))
            p[s] = selection_probs
    return q_sa
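# epsilon_greedy_action, e_greedy, and mc_control above all pass sample_action
# a plain list of selection probabilities, one entry per member of the Action
# enum, and expect an Action back. A plausible implementation under those
# assumptions (not taken from the original code):
import random


def sample_action(selection_probs):
    """Draw one Action according to the given probability list."""
    return random.choices(list(Action), weights=selection_probs, k=1)[0]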
def sample_action(self, net, obs, obs_var, action_var, exploration, step,
                  explore=False, testing=False):
    if random.random() <= 1 - exploration.value(step) or not explore:
        if self.args.use_guidance:
            # sample action distribution p
            obs = Variable(
                torch.from_numpy(
                    np.expand_dims(obs.transpose(2, 0, 1),
                                   axis=0)).float()) / 255.0
            if torch.cuda.is_available():
                obs = obs.cuda()
            with torch.no_grad():
                obs = obs.repeat(max(1, torch.cuda.device_count()), 1, 1, 1)
                self.p = net(obs, function='guide_action')[0]
            p = F.softmax(self.p / self.args.temperature,
                          dim=-1).data.cpu().numpy()
        else:
            p = None
        action = sample_action(self.args,
                               p,
                               net,
                               obs_var,
                               self.guides,
                               action_var=action_var,
                               testing=testing)
    else:
        action = np.random.rand(self.args.num_total_act) * 2 - 1
    action = np.clip(action, -1, 1)
    guide_act = get_guide_action(self.args.bin_divide, action)
    self.prev_act = action
    return action, guide_act
def action(self, scene_logprob, sample=True):
    # action prob
    pos_action_prob, pos_unnorm = self.pos_planner.action_prob(
        scene_logprob.position_logprob)
    num_action_prob, num_unnorm = self.num_planner.action_prob(
        scene_logprob.number_logprob)
    type_action_prob = self.type_planner.action_prob(
        scene_logprob.type_logprob)
    size_action_prob = self.size_planner.action_prob(
        scene_logprob.size_logprob)
    color_action_prob = self.color_planner.action_prob(
        scene_logprob.color_logprob)
    pos_num_select_prob = normalize(
        torch.cat([sum(pos_unnorm), sum(num_unnorm)], dim=-1))[0]
    pos_num_action_prob = torch.cat([
        pos_action_prob * pos_num_select_prob[:, 0].unsqueeze(-1),
        num_action_prob * pos_num_select_prob[:, 1].unsqueeze(-1)
    ], dim=-1)
    # get action
    pos_num_select, pos_num_select_logprob = sample_action(
        pos_num_select_prob, sample=sample)
    pos_action, pos_action_logprob = sample_action(pos_action_prob,
                                                   sample=sample)
    num_action, num_action_logprob = sample_action(num_action_prob,
                                                   sample=sample)
    type_action, type_action_logprob = sample_action(type_action_prob,
                                                     sample=sample)
    size_action, size_action_logprob = sample_action(size_action_prob,
                                                     sample=sample)
    color_action, color_action_logprob = sample_action(color_action_prob,
                                                       sample=sample)
    action = (pos_num_select, pos_action, num_action, type_action,
              size_action, color_action)
    action_logprob = (pos_num_select_logprob, pos_action_logprob,
                      num_action_logprob, type_action_logprob,
                      size_action_logprob, color_action_logprob)
    return action, action_logprob, (pos_num_action_prob, type_action_prob,
                                    size_action_prob, color_action_prob)
def rollout(self, net, idx, hyps):
    """
    rollout handles the actual rollout of the environment for n steps in
    time. It is called from run and performs a single rollout, placing the
    collected data into the shared lists found in the datas dict.

    net - torch Module object. This is the model to interact with the
        environment.
    idx - int identification number distinguishing the portion of the
        shared array designated for this runner
    hyps - dict object with all necessary hyperparameters
        Keys (Assume string type keys):
            "gamma" - reward decay coefficient
            "n_tsteps" - number of steps to be taken in the environment
            "n_frame_stack" - number of frames to stack for creation of
                the mdp state
            "preprocess" - function to preprocess raw observations
    """
    state = self.state_bookmark
    h = self.h_bookmark
    n_tsteps = hyps['n_tsteps']
    startx = idx * n_tsteps
    prev_val = None
    for i in range(n_tsteps):
        self.datas['states'][startx + i] = cuda_if(
            torch.FloatTensor(state))
        state_in = Variable(self.datas['states'][startx + i]).unsqueeze(0)
        if 'h_states' in self.datas:
            self.datas['h_states'][startx + i] = h.data[0]
            h_in = Variable(h.data)
            val, logits, h = net(state_in, h_in)
        else:
            val, logits = net(state_in)
        probs = F.softmax(logits, dim=-1)
        action = sample_action(probs.data)
        action = int(action.item())
        obs, rew, done, info = self.env.step(action + hyps['action_shift'])
        if hyps['render']:
            self.env.render()
        self.ep_rew += rew
        reset = done
        if "Pong" in hyps['env_type'] and rew != 0:
            done = True
        if done:
            self.rew_q.put(.99 * self.rew_q.get() + .01 * self.ep_rew)
            self.ep_rew = 0
            # Reset Recurrence
            if h is not None:
                h = Variable(cuda_if(torch.zeros(1, self.net.h_size)))
        self.datas['rewards'][startx + i] = rew
        self.datas['dones'][startx + i] = float(done)
        self.datas['actions'][startx + i] = action
        state = next_state(self.env,
                           self.obs_deque,
                           obs=obs,
                           reset=reset,
                           preprocess=hyps['preprocess'])
        if i > 0:
            prev_rew = self.datas['rewards'][startx + i - 1]
            prev_done = self.datas['dones'][startx + i - 1]
            delta = prev_rew + hyps['gamma'] * val.data * (
                1 - prev_done) - prev_val
            self.datas['deltas'][startx + i - 1] = delta
        prev_val = val.data.squeeze()

    # Funky bootstrapping
    endx = startx + n_tsteps - 1
    if not done:
        state_in = Variable(cuda_if(torch.FloatTensor(state))).unsqueeze(0)
        if 'h_states' in self.datas:
            val, logits, _ = net(state_in, Variable(h.data))
        else:
            val, logits = net(state_in)
        self.datas['rewards'][endx] += hyps['gamma'] * val.squeeze()  # Bootstrap
    self.datas['dones'][endx] = 1.
    self.datas['deltas'][endx] = self.datas['rewards'][endx] - prev_val
    self.state_bookmark = state
    if h is not None:
        self.h_bookmark = h.data
def main():
    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/network_model_pretrain.best.top"
    print >> sys.stderr, "Read model from ", best_network_file
    best_network_model = torch.load(best_network_file)

    embedding_matrix = numpy.load(embedding_file)

    "Building torch model"
    worker = network.Network(nnargs["pair_feature_dimention"],
                             nnargs["mention_feature_dimention"],
                             nnargs["word_embedding_dimention"],
                             nnargs["span_dimention"], 1000,
                             nnargs["embedding_size"],
                             nnargs["embedding_dimention"],
                             embedding_matrix).cuda()
    net_copy(worker, best_network_model)

    best_network_file = "./model/network_model_pretrain.best.top"
    print >> sys.stderr, "Read model from ", best_network_file
    best_network_model = torch.load(best_network_file)

    manager = network.Network(nnargs["pair_feature_dimention"],
                              nnargs["mention_feature_dimention"],
                              nnargs["word_embedding_dimention"],
                              nnargs["span_dimention"], 1000,
                              nnargs["embedding_size"],
                              nnargs["embedding_dimention"],
                              embedding_matrix).cuda()
    net_copy(manager, best_network_model)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    #train_docs_iter = DataReader.DataGnerater("train"+reduced)
    train_docs_iter = DataReader.DataGnerater("dev" + reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs_iter = DataReader.DataGnerater("dev" + reduced)
    test_docs_iter = DataReader.DataGnerater("test" + reduced)

    print "Performance after pretraining..."
    print "DEV"
    metric = performance.performance(dev_docs_iter, worker, manager)
    print "Average:", metric["average"]
    print "TEST"
    metric = performance.performance(test_docs_iter, worker, manager)
    print "Average:", metric["average"]
    print "***"
    print
    sys.stdout.flush()

    lr = nnargs["lr"]
    top_k = nnargs["top_k"]

    model_save_dir = "./model/reinforce/"
    utils.mkdir(model_save_dir)

    score_softmax = nn.Softmax()

    optimizer_manager = optim.RMSprop(manager.parameters(), lr=lr, eps=1e-6)
    optimizer_worker = optim.RMSprop(worker.parameters(), lr=lr, eps=1e-6)

    MAX_AVE = 2048

    for echo in range(nnargs["epoch"]):
        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo

        reward_log = Logger(Tensorboard + args.tb +
                            "/acl2018/%d/reward/" % echo,
                            flush_secs=3)
        entropy_log_manager = Logger(Tensorboard + args.tb +
                                     "/acl2018/%d/entropy/worker" % echo,
                                     flush_secs=3)
        entropy_log_worker = Logger(Tensorboard + args.tb +
                                    "/acl2018/%d/entropy/manager" % echo,
                                    flush_secs=3)

        #train_docs = utils.load_pickle(args.DOCUMENT + 'train_docs.pkl')
        train_docs = utils.load_pickle(args.DOCUMENT + 'dev_docs.pkl')
        docs_by_id = {doc.did: doc for doc in train_docs}

        ave_reward = []
        ave_manager_entropy = []
        ave_worker_entropy = []

        print >> sys.stderr, "Link docs ..."
        tmp_data = []
        cluster_info = {0: [0]}
        cluster_list = [0]
        current_new_cluster = 1
        predict_action_embedding = []
        choose_action = []
        mid = 1
        step = 0
        statistic = {
            "worker_hits": 0,
            "manager_hits": 0,
            "total": 0,
            "manager_predict_last": 0,
            "worker_predict_last": 0
        }
        for data in train_docs_iter.rl_case_generater(shuffle=True):
            rl = data["rl"]

            scores_manager, representations_manager = get_score_representations(
                manager, data)

            for s, e in zip(rl["starts"], rl["ends"]):
                action_embeddings = representations_manager[s:e]
                probs = F.softmax(torch.transpose(scores_manager[s:e], 0, 1))
                m = Categorical(probs)
                this_action = m.sample()
                index = this_action.data.cpu().numpy()[0]
                if index == (e - s - 1):
                    should_cluster = current_new_cluster
                    cluster_info[should_cluster] = []
                    current_new_cluster += 1
                else:
                    should_cluster = cluster_list[index]
                choose_action.append(index)
                cluster_info[should_cluster].append(mid)
                cluster_list.append(should_cluster)
                mid += 1
                cluster_indexs = torch.cuda.LongTensor(
                    cluster_info[should_cluster])
                action_embedding_predict = torch.mean(
                    action_embeddings[cluster_indexs], 0, keepdim=True)
                predict_action_embedding.append(action_embedding_predict)

            tmp_data.append(data)

            if rl["end"] == True:
                inside_index = 0
                manager_path = []
                worker_path = []
                doc = docs_by_id[rl["did"]]
                for data in tmp_data:
                    rl = data["rl"]
                    pair_target = data["pair_target"]
                    anaphoricity_target = 1 - data["anaphoricity_target"]
                    target = numpy.concatenate(
                        (pair_target, anaphoricity_target))[rl["reindex"]]

                    scores_worker, representations_worker = get_score_representations(
                        worker, data)

                    for s, e in zip(rl["starts"], rl["ends"]):
                        action_embeddings = representations_worker[s:e]
                        score = score_softmax(
                            torch.transpose(scores_worker[s:e], 0,
                                            1)).data.cpu().numpy()[0]
                        action_embedding_choose = predict_action_embedding[
                            inside_index]
                        similarities = torch.sum(
                            torch.abs(action_embeddings -
                                      action_embedding_choose), 1)
                        similarities = similarities.data.cpu().numpy()

                        action_probabilities = []
                        action_list = []
                        action_candidates = heapq.nlargest(
                            top_k, -similarities)
                        for action in action_candidates:
                            action_index = numpy.argwhere(
                                similarities == -action)[0][0]
                            action_probabilities.append(score[action_index])
                            action_list.append(action_index)

                        manager_action = choose_action[inside_index]
                        if not manager_action in action_list:
                            action_list.append(manager_action)
                            action_probabilities.append(
                                score[manager_action])

                        this_target = target[s:e]
                        manager_action = choose_action[inside_index]
                        sample_action = utils.sample_action(
                            numpy.array(action_probabilities))
                        worker_action = action_list[sample_action]

                        if this_target[worker_action] == 1:
                            statistic["worker_hits"] += 1
                        if this_target[manager_action] == 1:
                            statistic["manager_hits"] += 1
                        if worker_action == (e - s - 1):
                            statistic["worker_predict_last"] += 1
                        if manager_action == (e - s - 1):
                            statistic["manager_predict_last"] += 1
                        statistic["total"] += 1

                        inside_index += 1

                        #link = manager_action
                        link = worker_action
                        m1, m2 = rl['ids'][s + link]
                        doc.link(m1, m2)

                        manager_path.append(manager_action)
                        worker_path.append(worker_action)

                reward = doc.get_f1()

                for data in tmp_data:
                    for s, e in zip(rl["starts"], rl["ends"]):
                        ids = rl['ids'][s:e]
                        ana = ids[0, 1]
                        old_ant = doc.ana_to_ant[ana]
                        doc.unlink(ana)
                        costs = rl['costs'][s:e]
                        for ant_ind in range(e - s):
                            costs[ant_ind] = doc.link(ids[ant_ind, 0],
                                                      ana,
                                                      hypothetical=True,
                                                      beta=1)
                        doc.link(old_ant, ana)
                        #costs = autograd.Variable(torch.from_numpy(costs).type(torch.cuda.FloatTensor))

                inside_index = 0
                worker_entropy = 0.0
                for data in tmp_data:
                    new_step = step
                    # worker
                    scores_worker, representations_worker = get_score_representations(
                        worker, data, dropout=nnargs["dropout_rate"])
                    optimizer_worker.zero_grad()
                    worker_loss = None
                    for s, e in zip(rl["starts"], rl["ends"]):
                        costs = rl['costs'][s:e]
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(
                                torch.cuda.FloatTensor))
                        action = worker_path[inside_index]
                        score = F.softmax(
                            torch.transpose(scores_worker[s:e], 0, 1))
                        if not score.size()[1] == costs.size()[0]:
                            continue
                        score = torch.squeeze(score)
                        baseline = torch.sum(costs * score)
                        this_cost = torch.log(
                            score[action]) * -1.0 * (reward - baseline)
                        if worker_loss is None:
                            worker_loss = this_cost
                        else:
                            worker_loss += this_cost
                        worker_entropy += torch.sum(
                            score * torch.log(score + 1e-7)).data.cpu(
                            ).numpy()[0]  #+ 0.001*torch.sum(score*torch.log(score+1e-7))
                        inside_index += 1
                    worker_loss.backward()
                    torch.nn.utils.clip_grad_norm(worker.parameters(),
                                                  nnargs["clip"])
                    optimizer_worker.step()

                    ave_worker_entropy.append(worker_entropy)
                    if len(ave_worker_entropy) >= MAX_AVE:
                        ave_worker_entropy = ave_worker_entropy[1:]
                    entropy_log_worker.log_value(
                        'entropy',
                        float(sum(ave_worker_entropy)) /
                        float(len(ave_worker_entropy)), new_step)
                    new_step += 1

                inside_index = 0
                manager_entropy = 0.0
                for data in tmp_data:
                    new_step = step
                    rl = data["rl"]

                    ave_reward.append(reward)
                    if len(ave_reward) >= MAX_AVE:
                        ave_reward = ave_reward[1:]
                    reward_log.log_value(
                        'reward',
                        float(sum(ave_reward)) / float(len(ave_reward)),
                        new_step)

                    scores_manager, representations_manager = get_score_representations(
                        manager, data, dropout=nnargs["dropout_rate"])
                    optimizer_manager.zero_grad()
                    manager_loss = None
                    for s, e in zip(rl["starts"], rl["ends"]):
                        score = F.softmax(
                            torch.transpose(scores_manager[s:e], 0, 1))
                        costs = rl['costs'][s:e]
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(
                                torch.cuda.FloatTensor))
                        if not score.size()[1] == costs.size()[0]:
                            continue
                        action = manager_path[inside_index]
                        score = torch.squeeze(score)
                        baseline = torch.sum(costs * score)
                        this_cost = torch.log(score[action]) * -1.0 * (
                            reward - baseline
                        )  # + 0.001*torch.sum(score*torch.log(score+1e-7))
                        #this_cost = torch.sum(score*costs) + 0.001*torch.sum(score*torch.log(score+1e-7))
                        if manager_loss is None:
                            manager_loss = this_cost
                        else:
                            manager_loss += this_cost
                        manager_entropy += torch.sum(
                            score *
                            torch.log(score + 1e-7)).data.cpu().numpy()[0]
                        inside_index += 1
                    manager_loss.backward()
                    torch.nn.utils.clip_grad_norm(manager.parameters(),
                                                  nnargs["clip"])
                    optimizer_manager.step()

                    ave_manager_entropy.append(manager_entropy)
                    if len(ave_manager_entropy) >= MAX_AVE:
                        ave_manager_entropy = ave_manager_entropy[1:]
                    entropy_log_manager.log_value(
                        'entropy',
                        float(sum(ave_manager_entropy)) /
                        float(len(ave_manager_entropy)), new_step)
                    new_step += 1

                step = new_step

                tmp_data = []
                cluster_info = {0: [0]}
                cluster_list = [0]
                current_new_cluster = 1
                mid = 1
                predict_action_embedding = []
                choose_action = []

        end_time = timeit.default_timer()
        print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time -
                                                            start_time)
        print >> sys.stderr, "save model ..."
#print "Top k",top_k print "Worker Hits", statistic[ "worker_hits"], "Manager Hits", statistic[ "manager_hits"], "Total", statistic["total"] print "Worker predict last", statistic[ "worker_predict_last"], "Manager predict last", statistic[ "manager_predict_last"] #torch.save(network_model, model_save_dir+"network_model_rl_worker.%d"%echo) #torch.save(ana_network, model_save_dir+"network_model_rl_manager.%d"%echo) print "DEV" metric = performance.performance(dev_docs_iter, worker, manager) print "Average:", metric["average"] print "DEV manager" metric = performance_manager.performance(dev_docs_iter, worker, manager) print "Average:", metric["average"] print "TEST" metric = performance.performance(test_docs_iter, worker, manager) print "Average:", metric["average"] print sys.stdout.flush()
def main():
    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/network_model_pretrain.best.top.pair"
    print >> sys.stderr, "Read model from ", best_network_file
    best_network_model = torch.load(best_network_file)

    embedding_matrix = numpy.load(embedding_file)

    "Building torch model"
    network_model = network.Network(nnargs["pair_feature_dimention"],
                                    nnargs["mention_feature_dimention"],
                                    nnargs["word_embedding_dimention"],
                                    nnargs["span_dimention"], 1000,
                                    nnargs["embedding_size"],
                                    nnargs["embedding_dimention"],
                                    embedding_matrix).cuda()
    net_copy(network_model, best_network_model)

    best_network_file = "./model/network_model_pretrain.best.top.ana"
    print >> sys.stderr, "Read model from ", best_network_file
    best_network_model = torch.load(best_network_file)

    ana_network = network.Network(nnargs["pair_feature_dimention"],
                                  nnargs["mention_feature_dimention"],
                                  nnargs["word_embedding_dimention"],
                                  nnargs["span_dimention"], 1000,
                                  nnargs["embedding_size"],
                                  nnargs["embedding_dimention"],
                                  embedding_matrix).cuda()
    net_copy(ana_network, best_network_model)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    train_docs_iter = DataReader.DataGnerater("train" + reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs_iter = DataReader.DataGnerater("dev" + reduced)
    test_docs_iter = DataReader.DataGnerater("test" + reduced)

    print "Performance after pretraining..."
    print "DEV"
    metric = performance.performance(dev_docs_iter, network_model,
                                     ana_network)
    print "Average:", metric["average"]
    print "TEST"
    metric = performance.performance(test_docs_iter, network_model,
                                     ana_network)
    print "Average:", metric["average"]
    print "***"
    print
    sys.stdout.flush()

    l2_lambda = 1e-6
    #lr = 0.00001
    #lr = 0.000005
    lr = 0.000002
    #lr = 0.0000009
    dropout_rate = 0.5
    shuffle = True
    times = 0
    reinforce = True

    model_save_dir = "./model/reinforce/"
    utils.mkdir(model_save_dir)

    score_softmax = nn.Softmax()

    optimizer = optim.RMSprop(network_model.parameters(), lr=lr, eps=1e-6)
    ana_optimizer = optim.RMSprop(ana_network.parameters(), lr=lr, eps=1e-6)

    scheduler = lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.5)
    ana_scheduler = lr_scheduler.StepLR(ana_optimizer, step_size=15,
                                        gamma=0.5)

    for echo in range(30):
        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo
        scheduler.step()
        ana_scheduler.step()

        train_docs = utils.load_pickle(args.DOCUMENT + 'train_docs.pkl')
        docs_by_id = {doc.did: doc for doc in train_docs}

        print >> sys.stderr, "Link docs ..."
        tmp_data = []
        path = []
        for data in train_docs_iter.rl_case_generater(shuffle=True):
            mention_word_index, mention_span, candi_word_index, candi_span, \
                feature_pair, pair_antecedents, pair_anaphors, target, \
                positive, negative, anaphoricity_word_indexs, \
                anaphoricity_spans, anaphoricity_features, \
                anaphoricity_target, rl, candi_ids_return = data

            mention_index = autograd.Variable(
                torch.from_numpy(mention_word_index).type(
                    torch.cuda.LongTensor))
            mention_spans = autograd.Variable(
                torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(
                torch.from_numpy(candi_word_index).type(
                    torch.cuda.LongTensor))
            candi_spans = autograd.Variable(
                torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(
                torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(
                torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(
                torch.from_numpy(pair_antecedents).type(
                    torch.cuda.LongTensor))
            anaphoricity_index = autograd.Variable(
                torch.from_numpy(anaphoricity_word_indexs).type(
                    torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(
                torch.from_numpy(anaphoricity_spans).type(
                    torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(
                torch.from_numpy(anaphoricity_features).type(
                    torch.cuda.FloatTensor))

            output, pair_score = network_model.forward_all_pair(
                nnargs["word_embedding_dimention"], mention_index,
                mention_spans, candi_index, candi_spans, pair_feature,
                anaphors, antecedents, 0.0)
            ana_output, ana_score = ana_network.forward_anaphoricity(
                nnargs["word_embedding_dimention"], anaphoricity_index,
                anaphoricity_span, anaphoricity_feature, 0.0)
            ana_pair_output, ana_pair_score = ana_network.forward_all_pair(
                nnargs["word_embedding_dimention"], mention_index,
                mention_spans, candi_index, candi_spans, pair_feature,
                anaphors, antecedents, 0.0)

            reindex = autograd.Variable(
                torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))

            scores_reindex = torch.transpose(
                torch.cat((pair_score, ana_score), 1), 0, 1)[reindex]
            ana_scores_reindex = torch.transpose(
                torch.cat((ana_pair_score, ana_score), 1), 0, 1)[reindex]

            doc = docs_by_id[rl['did']]

            for s, e in zip(rl["starts"], rl["ends"]):
                score = score_softmax(
                    torch.transpose(ana_scores_reindex[s:e], 0,
                                    1)).data.cpu().numpy()[0]
                pair_score = score_softmax(
                    torch.transpose(scores_reindex[s:e - 1], 0,
                                    1)).data.cpu().numpy()[0]

                ana_action = utils.sample_action(score)
                if ana_action == (e - s - 1):
                    action = ana_action
                else:
                    pair_action = utils.sample_action(
                        pair_score * score[:-1])
                    action = pair_action
                path.append(action)

                link = action
                m1, m2 = rl['ids'][s + link]
                doc.link(m1, m2)

            tmp_data.append(
                (mention_word_index, mention_span, candi_word_index,
                 candi_span, feature_pair, pair_antecedents, pair_anaphors,
                 target, positive, negative, anaphoricity_word_indexs,
                 anaphoricity_spans, anaphoricity_features,
                 anaphoricity_target, rl, candi_ids_return))

            if rl["end"] == True:
                doc = docs_by_id[rl['did']]
                reward = doc.get_f1()
                inside_index = 0
                for mention_word_index, mention_span, candi_word_index, \
                        candi_span, feature_pair, pair_antecedents, \
                        pair_anaphors, target, positive, negative, \
                        anaphoricity_word_indexs, anaphoricity_spans, \
                        anaphoricity_features, anaphoricity_target, rl, \
                        candi_ids_return in tmp_data:
                    for (start, end) in zip(rl['starts'], rl['ends']):
                        ids = rl['ids'][start:end]
                        ana = ids[0, 1]
                        old_ant = doc.ana_to_ant[ana]
                        doc.unlink(ana)
                        costs = rl['costs'][start:end]
                        for ant_ind in range(end - start):
                            costs[ant_ind] = doc.link(ids[ant_ind, 0],
                                                      ana,
                                                      hypothetical=True,
                                                      beta=1)
                        doc.link(old_ant, ana)

                    cost = 0.0

                    mention_index = autograd.Variable(
                        torch.from_numpy(mention_word_index).type(
                            torch.cuda.LongTensor))
                    mention_spans = autograd.Variable(
                        torch.from_numpy(mention_span).type(
                            torch.cuda.FloatTensor))
                    candi_index = autograd.Variable(
                        torch.from_numpy(candi_word_index).type(
                            torch.cuda.LongTensor))
                    candi_spans = autograd.Variable(
                        torch.from_numpy(candi_span).type(
                            torch.cuda.FloatTensor))
                    pair_feature = autograd.Variable(
                        torch.from_numpy(feature_pair).type(
                            torch.cuda.FloatTensor))
                    anaphors = autograd.Variable(
                        torch.from_numpy(pair_anaphors).type(
                            torch.cuda.LongTensor))
                    antecedents = autograd.Variable(
                        torch.from_numpy(pair_antecedents).type(
                            torch.cuda.LongTensor))
                    anaphoricity_index = autograd.Variable(
                        torch.from_numpy(anaphoricity_word_indexs).type(
                            torch.cuda.LongTensor))
                    anaphoricity_span = autograd.Variable(
                        torch.from_numpy(anaphoricity_spans).type(
                            torch.cuda.FloatTensor))
                    anaphoricity_feature = autograd.Variable(
                        torch.from_numpy(anaphoricity_features).type(
                            torch.cuda.FloatTensor))

                    ana_output, ana_score = ana_network.forward_anaphoricity(
                        nnargs["word_embedding_dimention"],
                        anaphoricity_index, anaphoricity_span,
                        anaphoricity_feature, dropout_rate)
                    ana_pair_output, ana_pair_score = ana_network.forward_all_pair(
                        nnargs["word_embedding_dimention"], mention_index,
                        mention_spans, candi_index, candi_spans,
                        pair_feature, anaphors, antecedents, dropout_rate)

                    reindex = autograd.Variable(
                        torch.from_numpy(rl["reindex"]).type(
                            torch.cuda.LongTensor))
                    ana_scores_reindex = torch.transpose(
                        torch.cat((ana_pair_score, ana_score), 1), 0,
                        1)[reindex]

                    ana_optimizer.zero_grad()
                    ana_loss = None
                    i = inside_index
                    for s, e in zip(rl["starts"], rl["ends"]):
                        costs = rl["costs"][s:e]
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(
                                torch.cuda.FloatTensor))
                        score = torch.squeeze(
                            score_softmax(
                                torch.transpose(ana_scores_reindex[s:e], 0,
                                                1)))
                        baseline = torch.sum(score * costs)
                        action = path[i]
                        this_cost = torch.log(
                            score[action]) * -1.0 * (reward - baseline)
                        if ana_loss is None:
                            ana_loss = this_cost
                        else:
                            ana_loss += this_cost
                        i += 1
                    ana_loss.backward()
                    torch.nn.utils.clip_grad_norm(ana_network.parameters(),
                                                  5.0)
                    ana_optimizer.step()

                    mention_index = autograd.Variable(
                        torch.from_numpy(mention_word_index).type(
                            torch.cuda.LongTensor))
                    mention_spans = autograd.Variable(
                        torch.from_numpy(mention_span).type(
                            torch.cuda.FloatTensor))
                    candi_index = autograd.Variable(
                        torch.from_numpy(candi_word_index).type(
                            torch.cuda.LongTensor))
                    candi_spans = autograd.Variable(
                        torch.from_numpy(candi_span).type(
                            torch.cuda.FloatTensor))
                    pair_feature = autograd.Variable(
                        torch.from_numpy(feature_pair).type(
                            torch.cuda.FloatTensor))
                    anaphors = autograd.Variable(
                        torch.from_numpy(pair_anaphors).type(
                            torch.cuda.LongTensor))
                    antecedents = autograd.Variable(
                        torch.from_numpy(pair_antecedents).type(
                            torch.cuda.LongTensor))
                    anaphoricity_index = autograd.Variable(
                        torch.from_numpy(anaphoricity_word_indexs).type(
                            torch.cuda.LongTensor))
                    anaphoricity_span = autograd.Variable(
                        torch.from_numpy(anaphoricity_spans).type(
                            torch.cuda.FloatTensor))
                    anaphoricity_feature = autograd.Variable(
                        torch.from_numpy(anaphoricity_features).type(
                            torch.cuda.FloatTensor))

                    output, pair_score = network_model.forward_all_pair(
                        nnargs["word_embedding_dimention"], mention_index,
                        mention_spans, candi_index, candi_spans,
                        pair_feature, anaphors, antecedents, dropout_rate)
                    ana_output, ana_score = ana_network.forward_anaphoricity(
                        nnargs["word_embedding_dimention"],
                        anaphoricity_index, anaphoricity_span,
                        anaphoricity_feature, dropout_rate)

                    reindex = autograd.Variable(
                        torch.from_numpy(rl["reindex"]).type(
                            torch.cuda.LongTensor))
                    scores_reindex = torch.transpose(
                        torch.cat((pair_score, ana_score), 1), 0,
                        1)[reindex]

                    pair_loss = None
                    optimizer.zero_grad()
                    i = inside_index
                    index = 0
                    for s, e in zip(rl["starts"], rl["ends"]):
                        action = path[i]
                        if (not (action == (e - s - 1))) and (
                                anaphoricity_target[index] == 1):
                            costs = rl["costs"][s:e - 1]
                            costs = autograd.Variable(
                                torch.from_numpy(costs).type(
                                    torch.cuda.FloatTensor))
                            score = torch.squeeze(
                                score_softmax(
                                    torch.transpose(
                                        scores_reindex[s:e - 1], 0, 1)))
                            baseline = torch.sum(score * costs)
                            this_cost = torch.log(
                                score[action]) * -1.0 * (reward - baseline)
                            if pair_loss is None:
                                pair_loss = this_cost
                            else:
                                pair_loss += this_cost
                        i += 1
                        index += 1
                    if pair_loss is not None:
                        pair_loss.backward()
                        torch.nn.utils.clip_grad_norm(
                            network_model.parameters(), 5.0)
                        optimizer.step()
                    inside_index = i

                tmp_data = []
                path = []

        end_time = timeit.default_timer()
        print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time -
                                                            start_time)
        print >> sys.stderr, "cost:", cost
        print >> sys.stderr, "save model ..."
        torch.save(network_model,
                   model_save_dir + "network_model_rl_worker.%d" % echo)
        torch.save(ana_network,
                   model_save_dir + "network_model_rl_manager.%d" % echo)

        print "DEV"
        metric = performance.performance(dev_docs_iter, network_model,
                                         ana_network)
        print "Average:", metric["average"]
        print "DEV Ana: ", metric["ana"]
        print "TEST"
        metric = performance.performance(test_docs_iter, network_model,
                                         ana_network)
        print "Average:", metric["average"]
        print "TEST Ana: ", metric["ana"]
        print
        sys.stdout.flush()
dqn_heads = []
act_index = np.zeros(num_heads, dtype=int)
epsilon_eval = 0.05
done = False
for i in range(num_heads):
    the_model = torch.load('./tmp/model_epoch_200/model' + str(i) + '.pth')
    dqn_heads.append(DQN(num_actions))
    dqn_heads[i].load_state_dict(the_model)

#dqn = DQN()
#dqn.load_state_dict(the_model)

with torch.no_grad():
    var_phi = autograd.Variable(torch.Tensor(1, 4, 84, 84))

while (not done):
    # Each head votes on an action; the majority wins.
    for i in range(num_heads):
        act_index[i] = sample_action(atari, dqn_heads[i], var_phi,
                                     epsilon_eval, num_actions)
    act_index_vote = np.bincount(act_index).argmax()
    phi_next, r, done = atari.step(VALID_ACTION[act_index_vote])
    atari.display()
    time.sleep(0.01)
from model import DQN
from pong import Pong
import torch
import torch.autograd as autograd
from utils import sample_action

the_model = torch.load('./tmp/model.pth')
dqn = DQN()
dqn.load_state_dict(the_model)

pong = Pong()
done = False
VALID_ACTION = [0, 2, 5]
var_phi = autograd.Variable(torch.Tensor(1, 4, 84, 84), volatile=True)

while (not done):
    act_index = sample_action(pong, dqn, var_phi)
    phi_next, r, done = pong.step(VALID_ACTION[act_index])
    pong.display()
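# The DQN scripts here call sample_action(env, dqn, var_phi, epsilon, ...)
# and expect an action index back. The sketch below shows the usual
# epsilon-greedy reading of that signature; it is an assumption, not the
# real utils module, and env.getPhi() is a hypothetical stand-in for however
# the environment exposes its current stacked 4x84x84 observation.
import random

import numpy as np
import torch


def sample_action(env, dqn, var_phi, epsilon=0.05, num_actions=3):
    """Epsilon-greedy action selection from the network's Q-values."""
    if random.random() < epsilon:
        return np.random.randint(num_actions)  # explore
    phi = env.getPhi()  # hypothetical accessor for the current state frames
    var_phi.data.copy_(torch.from_numpy(phi).unsqueeze(0))
    q_values = dqn(var_phi)
    return int(q_values.argmax(dim=1).item())  # exploit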
def train(self):
    self.memoryInit()
    print("================\n"
          "Start training!!\n"
          "================")
    # self.env.reset(newWorkflow=True)
    self.env.reset2()
    epoch = 0
    update_count = 0
    score = 0.
    avg_score = 0.
    best_score = 0.
    t = time.time()
    SCORE = []
    QVALUE = []
    QVALUE_MEAN = []
    QVALUE_STD = []
    while (epoch < self.max_epoch):
        ep = self.epsilon_end + (self.epsilon -
                                 self.epsilon_end) * math.exp(
                                     -1. * epoch / self.epsilon_decay)
        done = False
        actions = []
        while not done:
            self.optimz.zero_grad()
            taskNos = self.env.getNewTasks()
            if len(taskNos) == 0:
                self.env.spanTimeProcess()
            else:
                for taskNo in taskNos:
                    action = sample_action(self.env, self.dqn,
                                           self.var_phi, ep)
                    vmPerm = self.env.scheduleTask(taskNo, action)
                    actions.append(action)
                    ob = self.env.getObservation(vmPerm=vmPerm)
                    done, r = self.env.isDone()
                    self.MP.put((ob, action, r, done))
                self.env.spanTimeProcess()
                done, r = self.env.isDone()
                if done:
                    # print("action: ", actions)
                    ob = self.env.getObservation()
                    self.MP.put((ob, action, r, done))
            score += r
            # batch sample from memory to train
            batch_phi, batch_a, batch_r, batch_phi_next, batch_done = self.MP.batch()
            self.var_batch_phi_next.data.copy_(
                torch.from_numpy(batch_phi_next))
            batch_target_q, _ = self.target_dqn(
                self.var_batch_phi_next).max(dim=1)
            mask_index = np.ones((self.batch_size, 1))
            mask_index[batch_done] = 0.0
            self.var_batch_r_mask.data.copy_(torch.from_numpy(mask_index))
            self.var_batch_r.data.copy_(torch.from_numpy(batch_r))
            y = self.var_batch_r + batch_target_q.mul(self.GAMMA).mul(
                self.var_batch_r_mask)
            y = y.detach()
            self.var_batch_phi.data.copy_(torch.from_numpy(batch_phi))
            batch_q = self.dqn(self.var_batch_phi)
            self.var_batch_a.data.copy_(
                torch.from_numpy(batch_a).long().view(-1, 1))
            batch_q = batch_q.gather(1, self.var_batch_a)
            loss = y.sub(batch_q).pow(2).mean()
            loss.backward()
            self.optimz.step()
            update_count += 1
            if update_count == self.update_step:
                self.target_dqn.load_state_dict(self.dqn.state_dict())
                update_count = 0
            QVALUE.append(batch_q.data.cpu().numpy().mean())
        SCORE.append(score)
        QVALUE_MEAN.append(np.mean(QVALUE))
        QVALUE_STD.append(np.std(QVALUE))
        QVALUE = []
        epoch += 1
        avg_score = 0.9 * avg_score + 0.1 * score
        #print(actions)
        #print(avg_score, self.env.currentTime, self.env.workflow.DeadLine)
        print(avg_score)
        score = 0.0
        # self.env.reset(newWorkflow=True)
        self.env.reset2()
        time_elapse = time.time() - t
        #if avg_score >= best_score and time_elapse > 60:
        # if avg_score >= best_score:
        #     torch.save(self.dqn.state_dict(), self.save_path)
        #     print('Model has been saved.')
        #     best_score = avg_score
        #     t = time.time()
    torch.save(self.dqn.state_dict(), self.save_path)
    print('Model has been saved.')
def rollout(self, net, idx, hyps):
    """
    rollout handles the actual rollout of the environment for n steps in
    time. It is called from run and performs a single rollout, placing the
    collected data into the shared lists found in the datas dict.

    net - torch Module object. This is the model to interact with the
        environment.
    idx - int identification number distinguishing the portion of the
        shared array designated for this runner
    hyps - dict object with all necessary hyperparameters
        Keys (Assume string type keys):
            "gamma" - reward decay coefficient
            "n_tsteps" - number of steps to be taken in the environment
            "n_frame_stack" - number of frames to stack for creation of
                the mdp state
            "preprocess" - function to preprocess raw observations
    """
    net.eval()
    hyps = self.hyps
    state = self.state_bookmark
    n_tsteps = hyps['n_tsteps']
    is_recurrent = hasattr(net, "fresh_h")
    if not is_recurrent:
        h = None
    else:
        h = self.prev_h if self.prev_h is not None else net.fresh_h()
    startx = idx * n_tsteps
    for i in range(n_tsteps):
        self.datas['states'][startx + i] = cuda_if(
            torch.FloatTensor(state))
        if is_recurrent:
            self.datas["hs"][startx + i] = cuda_if(h.detach().data)
            val, logits, h = net(self.datas['states'][startx + i][None],
                                 h=self.datas['hs'][startx + i][None])
            self.datas["next_hs"][startx + i] = cuda_if(h.detach().data)
        else:
            val, logits = net(self.datas['states'][startx + i][None])
        if self.hyps['discrete_env']:
            probs = F.softmax(logits, dim=-1)
            action = sample_action(probs.data)
            action = int(action.item())
        else:
            mu, sig = logits
            action = mu + torch.randn_like(sig) * sig
            action = action.cpu().detach().numpy().squeeze()
            if len(action.shape) == 0:
                action = np.asarray([float(action)])
        obs, rew, done, info = self.env.step(action + hyps['action_shift'])
        if hyps['render']:
            self.env.render()
        self.ep_rew += rew
        self.datas['rews'][startx + i] = float(rew)
        reset = done
        if "Pong" in hyps['env_type'] and rew != 0:
            done = True
        if done:
            self.rew_q.put(.99 * self.rew_q.get() + .01 * self.ep_rew)
            self.ep_rew = 0
        self.datas['dones'][startx + i] = 0
        if isinstance(action, np.ndarray):
            action = cuda_if(torch.from_numpy(action))
        self.datas['actions'][startx + i] = action
        state = next_state(self.env,
                           self.obs_deque,
                           obs=obs,
                           reset=reset,
                           preprocess=hyps['preprocess'])
        if i > 0:
            self.datas['next_states'][startx + i - 1] = \
                self.datas['states'][startx + i]

    endx = startx + n_tsteps - 1
    self.datas['next_states'][endx] = cuda_if(torch.FloatTensor(state))
    self.datas['dones'][endx] = 1.
    self.state_bookmark = state
    if h is not None:
        self.prev_h = h.data
t = time.time()
t0 = time.time()
SCORE = []
SCORE_EVAL = []
QVALUE = []
QVALUE_MEAN = []
QVALUE_STD = []
STEP_COUNT = []
while (epoch < max_epoch):
    k_head = np.random.randint(num_heads, size=1)[0]
    while (not done):
        act_index = sample_action(atari, dqn_heads[k_head], var_phi,
                                  epsilon, num_actions)
        if use_eGreedy == 1:
            epsilon = (epsilon - 1e-6) if epsilon > 0.1 else 0.1
        elif use_eGreedy == 0:
            epsilon = 0.0
        phi_next, r, done = atari.step(VALID_ACTION[act_index])
        # atari.display()
        MP.put((phi_next, act_index, r, done))
        r = np.clip(r, -1, 1)
        score += r
        for i in range(num_heads):
            optimz_heads[i].zero_grad()
RL_fail = 0
RL_cost = []
Random_fail = 0
Random_cost = []
for i in range(100):
    RL_actions = []
    He_actions = []
    while True:
        taskNos = env.getNewTasks()
        if len(taskNos) == 0:
            env.spanTimeProcess()
            done, r = env.isDone()
        else:
            for taskNo in taskNos:
                vmType = sample_action(env, dqn, var_phi, epsilon=0)
                RL_actions.append(vmType)
                # vmType = np.random.randint(0,3)
                done, reward = env.step(taskNo, vmType)
        if done:
            print('RL_actions: ', RL_actions)
            if r < 0:
                RL_fail += 1
            else:
                RL_cost.append(env.totalCost)
            break
    env.reset(newWorkflow=False)
    while True:
        taskNos = env.getNewTasks()
def main():
    DIR = args.DIR
    embedding_file = args.embedding_dir

    #network_file = "./model/model.pkl"
    #network_file = "./model/pretrain/network_model_pretrain.20"
    network_file = "./model/pretrain/network_model_pretrain.top.best"
    if os.path.isfile(network_file):
        print >> sys.stderr, "Read model from ./model/model.pkl"
        network_model = torch.load(network_file)
    else:
        embedding_matrix = numpy.load(embedding_file)
        #print len(embedding_matrix)

        "Building torch model"
        network_model = network.Network(pair_feature_dimention,
                                        mention_feature_dimention,
                                        word_embedding_dimention,
                                        span_dimention, 1000,
                                        embedding_size,
                                        embedding_dimention,
                                        embedding_matrix).cuda()
        print >> sys.stderr, "save model ..."
        torch.save(network_model, network_file)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    train_docs = DataReader.DataGnerater("train" + reduced)
    #train_docs = DataReader.DataGnerater("dev"+reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs = DataReader.DataGnerater("dev" + reduced)
    #test_docs = DataReader.DataGnerater("test"+reduced)

    l2_lambda = 1e-6
    lr = 0.00002
    dropout_rate = 0.5
    shuffle = True
    times = 0
    best_thres = 0.5
    reinforce = True

    model_save_dir = "./model/pretrain/"

    metrics = performance.performance(dev_docs, network_model)
    p, r, f = metrics["b3"]
    f_b = [f]

    #for echo in range(30,200):
    for echo in range(20):
        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo
        #if echo == 100:
        #    lr = lr/2.0
        #if echo == 150:
        #    lr = lr/2.0
        #optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, network_model.parameters()), lr=lr, weight_decay=l2_lambda)
        #optimizer = optim.RMSprop(network_model.parameters(), lr=lr, weight_decay=l2_lambda)
        cost = 0.0
        optimizer = optim.RMSprop(network_model.parameters(),
                                  lr=lr,
                                  eps=1e-5,
                                  weight_decay=l2_lambda)

        pair_cost_this_turn = 0.0
        ana_cost_this_turn = 0.0
        pair_nums = 0
        ana_nums = 0
        pos_num = 0
        neg_num = 0
        inside_time = 0.0

        score_softmax = nn.Softmax()

        cluster_info = []
        new_cluster_num = 0
        cluster_info.append(-1)
        action_list = []
        new_cluster_info = []
        tmp_data = []

        #for data in train_docs.rl_case_generater():
        for data in train_docs.rl_case_generater(shuffle=True):
            inside_time += 1

            this_doc = train_docs
            tmp_data.append(data)

            mention_word_index, mention_span, candi_word_index, candi_span, \
                feature_pair, pair_antecedents, pair_anaphors, target, \
                positive, negative, anaphoricity_word_indexs, \
                anaphoricity_spans, anaphoricity_features, \
                anaphoricity_target, rl, candi_ids_return = data

            gold_chain = this_doc.gold_chain[rl["did"]]
            gold_dict = {}
            for chain in gold_chain:
                for item in chain:
                    gold_dict[item] = chain

            mention_index = autograd.Variable(
                torch.from_numpy(mention_word_index).type(
                    torch.cuda.LongTensor))
            mention_span = autograd.Variable(
                torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(
                torch.from_numpy(candi_word_index).type(
                    torch.cuda.LongTensor))
            candi_spans = autograd.Variable(
                torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(
                torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(
                torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(
                torch.from_numpy(pair_antecedents).type(
                    torch.cuda.LongTensor))
            anaphoricity_index = autograd.Variable(
                torch.from_numpy(anaphoricity_word_indexs).type(
                    torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(
                torch.from_numpy(anaphoricity_spans).type(
                    torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(
                torch.from_numpy(anaphoricity_features).type(
                    torch.cuda.FloatTensor))

            output, pair_score = network_model.forward_all_pair(
                word_embedding_dimention, mention_index, mention_span,
                candi_index, candi_spans, pair_feature, anaphors,
                antecedents, dropout_rate)
            ana_output, ana_score = network_model.forward_anaphoricity(
                word_embedding_dimention, anaphoricity_index,
                anaphoricity_span, anaphoricity_feature, dropout_rate)

            reindex = autograd.Variable(
                torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))
            scores_reindex = torch.transpose(
                torch.cat((pair_score, ana_score), 1), 0, 1)[reindex]
            #scores_reindex = torch.transpose(torch.cat((pair_score,-1-0.3*ana_score),1),0,1)[reindex]

            for s, e in zip(rl["starts"], rl["ends"]):
                #action_prob: scores_reindex[s:e][1]
                score = score_softmax(
                    torch.transpose(scores_reindex[s:e], 0,
                                    1)).data.cpu().numpy()[0]
                this_action = utils.sample_action(score)
                #this_action = ac_list.index(max(score.tolist()))
                action_list.append(this_action)
                if this_action == len(score) - 1:
                    should_cluster = new_cluster_num
                    new_cluster_num += 1
                    new_cluster_info.append(1)
                else:
                    should_cluster = cluster_info[this_action]
                    new_cluster_info.append(0)
                cluster_info.append(should_cluster)

            if rl["end"] == True:
                ev_document = utils.get_evaluation_document(
                    cluster_info, this_doc.gold_chain[rl["did"]],
                    candi_ids_return, new_cluster_num)
                p, r, f = evaluation.evaluate_documents(
                    [ev_document], evaluation.b_cubed)
                trick_reward = utils.get_reward_trick(
                    cluster_info, gold_dict, new_cluster_info, action_list,
                    candi_ids_return)
                #reward = f + trick_reward
                average_f = float(sum(f_b)) / len(f_b)
                reward = (f - average_f) * 10
                f_b.append(f)
                if len(f_b) > 128:
                    f_b = f_b[1:]

                index = 0
                for data in tmp_data:
                    mention_word_index, mention_span, candi_word_index, \
                        candi_span, feature_pair, pair_antecedents, \
                        pair_anaphors, target, positive, negative, \
                        anaphoricity_word_indexs, anaphoricity_spans, \
                        anaphoricity_features, anaphoricity_target, rl, \
                        candi_ids_return = data

                    mention_index = autograd.Variable(
                        torch.from_numpy(mention_word_index).type(
                            torch.cuda.LongTensor))
                    mention_span = autograd.Variable(
                        torch.from_numpy(mention_span).type(
                            torch.cuda.FloatTensor))
                    candi_index = autograd.Variable(
                        torch.from_numpy(candi_word_index).type(
                            torch.cuda.LongTensor))
                    candi_spans = autograd.Variable(
                        torch.from_numpy(candi_span).type(
                            torch.cuda.FloatTensor))
                    pair_feature = autograd.Variable(
                        torch.from_numpy(feature_pair).type(
                            torch.cuda.FloatTensor))
                    anaphors = autograd.Variable(
                        torch.from_numpy(pair_anaphors).type(
                            torch.cuda.LongTensor))
                    antecedents = autograd.Variable(
                        torch.from_numpy(pair_antecedents).type(
                            torch.cuda.LongTensor))
                    anaphoricity_index = autograd.Variable(
                        torch.from_numpy(anaphoricity_word_indexs).type(
                            torch.cuda.LongTensor))
                    anaphoricity_span = autograd.Variable(
                        torch.from_numpy(anaphoricity_spans).type(
                            torch.cuda.FloatTensor))
                    anaphoricity_feature = autograd.Variable(
                        torch.from_numpy(anaphoricity_features).type(
                            torch.cuda.FloatTensor))

                    rl_costs = autograd.Variable(
                        torch.from_numpy(rl["costs"]).type(
                            torch.cuda.FloatTensor))
                    rl_costs = torch.transpose(rl_costs, 0, 1)

                    output, pair_score = network_model.forward_all_pair(
                        word_embedding_dimention, mention_index,
                        mention_span, candi_index, candi_spans,
                        pair_feature, anaphors, antecedents, dropout_rate)
                    ana_output, ana_score = network_model.forward_anaphoricity(
                        word_embedding_dimention, anaphoricity_index,
                        anaphoricity_span, anaphoricity_feature,
                        dropout_rate)

                    reindex = autograd.Variable(
                        torch.from_numpy(rl["reindex"]).type(
                            torch.cuda.LongTensor))

                    optimizer.zero_grad()
                    loss = None
                    scores_reindex = torch.transpose(
                        torch.cat((pair_score, ana_score), 1), 0,
                        1)[reindex]
                    #scores_reindex = torch.transpose(torch.cat((pair_score,-1-0.3*ana_score),1),0,1)[reindex]
                    for s, e in zip(rl["starts"], rl["ends"]):
                        #action_prob: scores_reindex[s:e][1]
                        this_action = action_list[index]
                        #current_reward = reward + trick_reward[index]
                        current_reward = reward
                        #this_loss = -reward*(torch.transpose(F.log_softmax(torch.transpose(scores_reindex[s:e],0,1)),0,1)[this_action])
                        this_loss = -current_reward * (torch.transpose(
                            F.log_softmax(
                                torch.transpose(scores_reindex[s:e], 0,
                                                1)), 0, 1)[this_action])
                        if loss is None:
                            loss = this_loss
                        else:
                            loss += this_loss
                        index += 1
                    loss /= len(rl["starts"])
                    #loss = loss/train_docs.scale_factor

                    ## policy gradient
                    cost += loss.data[0]
                    loss.backward()
                    optimizer.step()

                new_cluster_num = 0
                cluster_info = []
                cluster_info.append(-1)
                tmp_data = []
                action_list = []
                new_cluster_info = []

            #if inside_time%50 == 0:
            #    performance.performance(dev_docs,network_model)
            #    print
            #    sys.stdout.flush()

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time -
                                                               start_time)
        print >> sys.stderr, "cost:", cost
        #print >> sys.stderr,"save model ..."
        #torch.save(network_model, model_save_dir+"network_model_pretrain.%d"%echo)

        performance.performance(dev_docs, network_model)
        sys.stdout.flush()
best_score = -21.0
t = time.time()
SCORE = []
QVALUE = []
QVALUE_MEAN = []
QVALUE_STD = []
while (epoch < max_epoch):
    while (not done):
        optimz.zero_grad()
        act_index = sample_action(pong, dqn, var_phi, epsilon)
        epsilon = (epsilon - 1e-6) if epsilon > 0.1 else 0.1
        phi_next, r, done = pong.step(VALID_ACTION[act_index])
        pong.display()
        MP.put((phi_next, act_index, r, done))
        r = np.clip(r, -1, 1)
        score += r
        # batch sample from memory to train
        batch_phi, batch_a, batch_r, batch_phi_next, batch_done = MP.batch()
        var_batch_phi_next.data.copy_(torch.from_numpy(batch_phi_next))
        batch_target_q, _ = target_dqn(var_batch_phi_next).max(dim=1)
        mask_index = np.ones((batch_size, 1))
def q_learning(env, crosses=True, n_episodes=200000, eps=0.01, gamma=1.,
               alpha=0.05, logging_freq=10000, estimate_episodes=100000):
    Q = defaultdict(lambda: np.zeros(env.n_rows * env.n_cols))
    rewards = []
    for i in range(1, n_episodes + 1):
        if not i % logging_freq:
            pi = compute_policy(Q)
            cur_reward = estimate_return(env, pi,
                                         n_episodes=estimate_episodes)
            rewards.append(cur_reward)
            print(i, cur_reward)
        env.reset()
        done = False
        s = env.getHash()
        a, a_int = None, None
        while not done:
            if crosses:
                # crosses: our move
                s = env.getHash()
                a = sample_action(env, Q, eps)
                a_int = env.int_from_action(a)
                s_prime, r, done, _ = env.step(a)
                if done:  # we won
                    Q[s][a_int] += alpha * (r - Q[s][a_int])
                    break
                # naughts: their move
                s_prime, r, done = random_move(env)
                if done:  # we lost
                    Q[s][a_int] += alpha * (-r - Q[s][a_int])
                    break
                Q[s][a_int] += alpha * (r + gamma * max(Q[s_prime[0]]) -
                                        Q[s][a_int])
            else:
                # crosses: their move
                s_prime, r, done = random_move(env)
                if done:  # we lost
                    Q[s][a_int] += alpha * (-r - Q[s][a_int])
                    break
                if a_int is not None:  # not our first move
                    Q[s][a_int] += alpha * (r + gamma * max(Q[s_prime[0]]) -
                                            Q[s][a_int])
                # naughts: our move
                a = sample_action(env, Q, eps)
                a_int = env.int_from_action(a)
                s_prime, r, done, _ = env.step(a)
                if done:  # we won
                    Q[s][a_int] += alpha * (r - Q[s][a_int])
                    break
                s = env.getHash()
    return Q, rewards
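# q_learning above calls sample_action(env, Q, eps) and converts the result
# with env.int_from_action, so this variant returns an action object rather
# than an index. A minimal epsilon-greedy sketch under those assumptions;
# env.legal_actions() is a hypothetical accessor for the moves available in
# the current position.
import random


def sample_action(env, Q, eps):
    """Epsilon-greedy over the tabular Q-values for the current state."""
    actions = env.legal_actions()  # hypothetical: available moves
    if random.random() < eps:
        return random.choice(actions)  # explore
    s = env.getHash()
    # Exploit: pick the legal action with the highest tabular value.
    return max(actions, key=lambda a: Q[s][env.int_from_action(a)])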