def getMaxQAndActionsIn(self, QValue, size, ava_actions):
    """Return the maximum Q-value inside the valid window and every
    [[x, y], action] pair that attains it."""
    max_Q = None
    actions = []
    for y in range(self.y_space, size - self.y_space):
        for x in range(self.x_space, size - self.x_space):
            for a in ava_actions:
                if max_Q is None or QValue[y][x][a] > max_Q:
                    max_Q = QValue[y][x][a]
                    actions = [[[x, y], a]]
                elif QValue[y][x][a] == max_Q:
                    actions.append([[x, y], a])
    # Periodic debug dump every 50 time steps on the 84x84 screen: first the
    # hit-points channel, then the Q-map, both subsampled every 4 pixels.
    if (self.model.time_step + 1) % 50 == 0 and size == 84:
        for y in range(0, size, 4):
            for x in range(0, size, 4):
                print(int(self.status.input_scr[y][x][2]), end=" ")  # hit_points_selected
            print()
        for y in range(0, size, 4):
            for x in range(0, size, 4):
                print(int(QValue[y][x][0] * 10) / 10, end=" ")
            print()
    return max_Q, actions
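# A self-contained illustration of the tie-aware arg-max scan above. This is
# a sketch, not the repository's code: the grid size, margin, and action count
# are made-up stand-ins for size, self.y_space/x_space, and ava_actions.
import numpy as np

Q = np.random.rand(8, 8, 3)  # hypothetical 8x8 Q-map with 3 actions
margin, size = 1, 8

max_Q, candidates = None, []
for y in range(margin, size - margin):
    for x in range(margin, size - margin):
        for a in range(Q.shape[2]):
            if max_Q is None or Q[y][x][a] > max_Q:
                max_Q, candidates = Q[y][x][a], [[[x, y], a]]
            elif Q[y][x][a] == max_Q:
                candidates.append([[x, y], a])
print(max_Q, candidates)  # all [[x, y], action] pairs sharing the maximum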
def update_master_policy(self, rbs, disc, lr, cter):
    """One DQN-style update of the master policy from the replay buffer rbs."""
    samples = random.sample(rbs, batch_size)
    minimaps, screens, infos = [], [], []
    next_minimaps, next_screens, next_infos = [], [], []
    actions, rewards = [], []
    for i, [obs, _, action, _, next_obs] in enumerate(samples):
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        next_minimap = np.array(next_obs.observation['minimap'], dtype=np.float32)
        next_minimap = np.expand_dims(U.preprocess_minimap(next_minimap), axis=0)
        # The original read the *current* observation here; the next screen
        # and next info must come from next_obs.
        next_screen = np.array(next_obs.observation['screen'], dtype=np.float32)
        next_screen = np.expand_dims(U.preprocess_screen(next_screen), axis=0)
        next_info = np.zeros([1, self.isize], dtype=np.float32)
        next_info[0, next_obs.observation['available_actions']] = 1
        reward = next_obs.reward
        minimaps.append(minimap)
        screens.append(screen)
        infos.append(info)
        next_minimaps.append(next_minimap)
        next_screens.append(next_screen)
        next_infos.append(next_info)
        cur_action = np.zeros(num_subpolicies)
        cur_action[action] = 1  # one-hot encode the chosen sub-policy
        actions.append(cur_action)
        rewards.append(reward)
    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)
    next_minimaps = np.concatenate(next_minimaps, axis=0)
    next_screens = np.concatenate(next_screens, axis=0)
    next_infos = np.concatenate(next_infos, axis=0)
    # TD targets: r for terminal transitions, r + disc * max_a Q(s', a) otherwise.
    y_batch = []
    Qvalue_batch = self.sess_master.run(
        self.subpolicy_Q,
        feed_dict={self.minimap: next_minimaps,
                   self.screen: next_screens,
                   self.info: next_infos})
    for i in range(batch_size):
        terminal = samples[i][3]
        if terminal:
            y_batch.append(rewards[i])
        else:
            y_batch.append(rewards[i] + disc * np.max(Qvalue_batch[i]))
    self.sess_master.run(
        self.master_train_op,
        feed_dict={self.minimap: minimaps,
                   self.screen: screens,
                   self.info: infos,
                   self.y_input: y_batch,
                   self.action_input: actions,
                   self.learning_rate: lr})
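# The y_batch loop above can be vectorized. A minimal sketch with toy inputs
# (the batch size, terminal flags, and Q-values here are made up):
import numpy as np

rewards = np.array([0.0, 1.0, 0.0, 0.5], dtype=np.float32)
terminals = np.array([False, True, False, False])
Qvalue_batch = np.random.rand(4, 3).astype(np.float32)  # Q(s', a) per sample
disc = 0.99

# r for terminal transitions, r + disc * max_a Q(s', a) otherwise.
y_batch = np.where(terminals, rewards, rewards + disc * Qvalue_batch.max(axis=1))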
def run(self):
    observations = []
    self.rewards = []
    actions = []
    actions_spatial = []
    actions_spatial_mask = []
    available_actions = []
    batch_dones = []
    self.values = []
    probs_spatial = []
    probs = []
    for _ in range(self.num_steps):
        observations.append(self.observation)
        action_ids, spatial_indexes, value, prob, prob_spatial = self.model.predict(
            np.asarray([self.observation]).swapaxes(0, 1),
            [self.available_actions_mask])
        self.values.append(value)
        probs.append(prob)
        probs_spatial.append(prob_spatial)
        batch_dones.append(self.terminal)
        action, spatial_mask = self.make_action(action_ids[0], spatial_indexes[0])
        actions.append(action_ids[0])
        actions_spatial.append(spatial_indexes[0])
        actions_spatial_mask.append(spatial_mask)
        available_actions.append(self.available_actions_mask)
        self.observation, reward, self.terminal, self.available_actions_mask = self.env.step(
            action)
        self.stats_recorder.after_step(reward=reward, terminal=self.terminal)
        self.rewards.append(reward)
    advantage_estimations = np.zeros_like(self.rewards)
    last_value = self.model.predict_value(self.observation)[0]
    for t in reversed(range(self.num_steps)):
        if t == self.num_steps - 1:
            self.advantage_estimation = self.estimate_advantage(
                t, self.terminal, last_value)
        else:
            self.advantage_estimation = self.estimate_advantage(
                t, batch_dones[t + 1], self.values[t + 1])
        advantage_estimations[t] = self.advantage_estimation
    observations = np.asarray(observations).swapaxes(0, 1)
    return (observations,
            actions,
            available_actions,
            actions_spatial,
            actions_spatial_mask,
            advantage_estimations,
            self.values,
            probs,
            probs_spatial)
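# estimate_advantage is not shown in this snippet. One common scheme it may
# correspond to is Generalized Advantage Estimation (GAE); the sketch below is
# an assumption, and the gamma/lam defaults are hypothetical.
import numpy as np

def gae(rewards, values, last_value, dones, gamma=0.99, lam=0.95):
    """Backward GAE pass over one rollout, bootstrapping from last_value."""
    values = list(values) + [last_value]
    adv = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        running = delta + gamma * lam * nonterminal * running
        adv[t] = running
    return adv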
def multistep(self, obs):
    super(MineralShardsMultiAgent, self).step(obs)
    # Uncomment this to see the agent taking actions step-by-step.
    # time.sleep(0.8)

    # Get list of marines
    marines = [
        unit for unit in obs.observation.feature_units
        if unit.alliance == _PLAYER_SELF
    ]
    # Bail if no marines
    if not marines:
        return [FUNCTIONS.no_op()]

    # Get list of mineral locations
    minerals = [[unit.x, unit.y]
                for unit in obs.observation.feature_units
                if unit.alliance == _PLAYER_NEUTRAL]
    # Bail if no minerals
    if not minerals:
        return [FUNCTIONS.no_op()]

    # Loop through marines
    actions = []
    for marine in marines:
        marine_xy = [marine.x, marine.y]
        # Remove the previous target of the other marine from consideration
        other_targets = [
            target_xy for (tag, target_xy) in self._marine_targets.items()
            if tag != marine.tag
        ]
        other_target_xy = other_targets[0] if other_targets else (-1, -1)
        minerals_noprevious = [
            x for x in minerals if x != other_target_xy
        ] if len(minerals) > 1 else minerals
        # Find the closest mineral.
        distances = numpy.linalg.norm(
            numpy.array(minerals_noprevious) - numpy.array(marine_xy), axis=1)
        closest_mineral_xy = minerals_noprevious[numpy.argmin(distances)]
        # Update the target of this marine
        self._marine_targets[marine.tag.item()] = closest_mineral_xy
        # Make the action; .item() converts numpy.int64 to a native Python int.
        action = FUNCTIONS.move_unit(marine.tag.item(), "now", closest_mineral_xy)
        actions.append(action)
    return actions if actions else [FUNCTIONS.no_op()]
def move_drone_random_round_hatchery(self, drone_ids, pos):
    """Order each drone to a random point in a square window around pos."""
    actions = []
    for drone in drone_ids:
        action = sc_pb.Action()
        action.action_raw.unit_command.ability_id = self._move_ability
        x = pos[0] + random.randint(self._range_low, self._range_high)
        y = pos[1] + random.randint(self._range_low, self._range_high)
        action.action_raw.unit_command.target_world_space_pos.x = x
        action.action_raw.unit_command.target_world_space_pos.y = y
        action.action_raw.unit_command.unit_tags.append(drone)
        actions.append(action)
    return actions
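# The raw-action protobuf pattern above, shown standalone. The ability id and
# unit tag are hypothetical placeholders; only the field layout
# (Action -> action_raw -> unit_command) comes from s2clientprotocol.
import random
from s2clientprotocol import sc2api_pb2 as sc_pb

MOVE_ABILITY = 16          # assumed Move ability id
drone_tag = 4353163265     # made-up unit tag
pos = (30.0, 30.0)

action = sc_pb.Action()
cmd = action.action_raw.unit_command
cmd.ability_id = MOVE_ABILITY
cmd.target_world_space_pos.x = pos[0] + random.randint(-3, 3)
cmd.target_world_space_pos.y = pos[1] + random.randint(-3, 3)
cmd.unit_tags.append(drone_tag)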
def run(self):
    observations = []
    rewards = []
    actions = []
    actions_spatial = []
    actions_spatial_mask = []
    available_actions = []
    terminals = []
    values = []
    for _ in range(self.batch_size):
        observations.append(self.observation)
        action_ids, spatial_indexes, value = self.model.predict(
            np.asarray([self.observation]).swapaxes(0, 1),
            [self.available_actions_masks])
        values.append(value)
        action, spatial_mask = self.make_action(action_ids[0], spatial_indexes[0])
        actions.append(action_ids[0])
        actions_spatial.append(spatial_indexes[0])
        actions_spatial_mask.append(spatial_mask)
        available_actions.append(self.available_actions_masks)
        self.observation, reward, terminal, self.available_actions_masks = self.env.step(
            action)
        self.stats_recorder.after_step(reward=reward, terminal=terminal)
        rewards.append(reward)
        terminals.append(terminal)
    if terminals[-1] == 0:
        # Rollout ended mid-episode: bootstrap from the value of the last
        # observation, then drop the appended pseudo-reward again.
        next_value = self.model.predict_value(self.observation)[0]
        discounted_rewards = self.discount(rewards + [next_value],
                                           terminals + [False],
                                           self.discount_rate)[:-1]
    else:
        discounted_rewards = self.discount(rewards, terminals, self.discount_rate)
    observations = np.asarray(observations).swapaxes(0, 1)
    self.model.train(observations, actions, available_actions, actions_spatial,
                     actions_spatial_mask, discounted_rewards, values)
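# self.discount is not defined in this snippet. A sketch of the conventional
# implementation its call sites imply (terminal flags zero out the running
# return, and appending next_value bootstraps the unfinished episode):
import numpy as np

def discount(rewards, terminals, gamma):
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * (1.0 - float(terminals[t]))
        returns[t] = running
    return returns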
def prepare_training_inputs(sampled_exps, device='cpu'):
    states = []
    actions = []
    rewards = []
    next_states = []
    dones = []
    for sampled_exp in sampled_exps:
        states.append(sampled_exp[0])
        actions.append(sampled_exp[1])
        rewards.append(sampled_exp[2])
        next_states.append(sampled_exp[3])
        dones.append(sampled_exp[4])
    states = torch.cat(states, dim=0).float().to(device)
    actions = torch.cat(actions, dim=0).to(device)
    rewards = torch.cat(rewards, dim=0).float().to(device)
    next_states = torch.cat(next_states, dim=0).float().to(device)
    dones = torch.cat(dones, dim=0).float().to(device)
    return states, actions, rewards, next_states, dones
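# Usage sketch for prepare_training_inputs with toy tensors. Each experience
# carries a leading batch dimension of 1 so torch.cat can stack them.
import torch

exps = [(torch.randn(1, 4),            # state
         torch.tensor([[0]]),          # action
         torch.tensor([[1.0]]),        # reward
         torch.randn(1, 4),            # next state
         torch.tensor([[0.0]]))        # done flag
        for _ in range(8)]

s, a, r, s2, d = prepare_training_inputs(exps, device='cpu')
print(s.shape, a.shape, d.shape)  # torch.Size([8, 4]) torch.Size([8, 1]) torch.Size([8, 1])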
def gradient_step(self, batch_size):
    states = []
    actions = []
    rewards = []
    next_states = []
    length = len(self.replay_memory)
    for i in range(batch_size):
        # Sample an episode, re-drawing until we hit a non-empty one, then
        # sample a transition inside it.
        index = random.randint(0, length - 1)
        while len(self.replay_memory[index]) == 0:
            index = random.randint(0, length - 1)
        length2 = len(self.replay_memory[index])
        index2 = random.randint(0, length2 - 1)
        states.append(self.replay_memory[index][index2][0])
        actions.append(self.replay_memory[index][index2][1])
        rewards.append(self.replay_memory[index][index2][2])
        next_states.append(self.replay_memory[index][index2][3])
    states = np.array(states)
    actions = self.action_mask(actions)
    rewards = np.array(rewards)
    next_states = np.array(next_states)
    # Add a channel dimension: (batch, 1, depth, height, width).
    s = states.shape
    states = states.reshape((s[0], 1, s[1], s[2], s[3]))
    next_states = next_states.reshape((s[0], 1, s[1], s[2], s[3]))
    states = Variable(torch.from_numpy(states)).type(torch.DoubleTensor)
    next_states = Variable(torch.from_numpy(next_states)).type(torch.DoubleTensor)
    targets = self.compute_targets(rewards, next_states)
    targets = Variable(torch.from_numpy(targets)).type(torch.DoubleTensor)
    output = self.policy(states)
    # Keep only the Q-values of the actions that were actually taken.
    output = torch.masked_select(output, actions)
    self.optimizer.zero_grad()
    error = self.policy.loss(output, targets)
    error.backward()
    self.optimizer.step()
    print("The error is " + str(error.data.numpy()[0]))
    return error.data.numpy()[0]
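# self.action_mask is not shown; torch.masked_select needs a boolean mask with
# exactly one True per row to pick out the taken action's Q-value. A sketch of
# that pattern (the shapes are made up):
import torch

q = torch.randn(4, 6)              # Q-values: batch of 4, 6 actions
acts = torch.tensor([2, 0, 5, 1])  # action taken per sample
mask = torch.zeros_like(q, dtype=torch.bool)
mask[torch.arange(4), acts] = True
chosen_q = torch.masked_select(q, mask)  # shape (4,)
# Equivalent and usually clearer: q.gather(1, acts.unsqueeze(1)).squeeze(1)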
def run(self):
    observations = []
    rewards = []
    actions = []
    actions_spatial = []
    actions_spatial_mask = []
    available_action_masks = []
    terminals = []
    for _ in range(self.batch_size):
        observations.append(self.observation)
        action_ids, spatial_indexes = self.model.predict(
            np.asarray([self.observation]).swapaxes(0, 1),
            [self.available_actions_mask])
        action, spatial_mask = self.make_action(action_ids[0], spatial_indexes[0])
        actions.append(action_ids[0])
        actions_spatial.append(spatial_indexes[0])
        actions_spatial_mask.append(spatial_mask)
        available_action_masks.append(self.available_actions_mask)
        self.observation, reward, terminal, self.available_actions_mask = self.env.step(
            action)
        self.stats_recorder.after_step(reward=reward, terminal=terminal)
        rewards.append(reward)
        terminals.append(terminal)
    rewards = self.discount(rewards, terminals, self.discount_rate)
    observations = np.asarray(observations).swapaxes(0, 1)
    self.model.train(observations=observations,
                     actions=actions,
                     available_actions_masks=available_action_masks,
                     actions_spatial=actions_spatial,
                     actions_spatial_masks=actions_spatial_mask,
                     rewards=rewards)
def group_init_queue(player_relative):
    actions = []
    # nonzero() on a 2-D mask returns (rows, cols), i.e. (y, x); the original
    # unpacked these in (x, y) order, swapping the axes.
    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()

    # try:
    #     actions.append({"base_action": _SELECT_ARMY, "sub7": _SELECT_ALL})
    # except Exception as e:
    #     print(e)
    # for i in range(len(player_x)):
    #     if i % 4 != 0:
    #         continue
    #     xy = [player_x[i], player_y[i]]
    #     actions.append({"base_action": _SELECT_POINT, "sub6": 0,
    #                     "x0": xy[0], "y0": xy[1]})

    group_id = 0
    group_list = []
    unit_xy_list = []
    for i in range(len(player_x)):
        if group_id > 9:
            break
        xy = [player_x[i], player_y[i]]
        unit_xy_list.append(xy)
        # 2/select_point (6/select_point_act [4]; 0/screen [84, 84])
        # 4/select_control_group (4/control_group_act [5]; 5/control_group_id [10])
        if len(unit_xy_list) >= 1:
            for idx, xy in enumerate(unit_xy_list):
                # First click selects; subsequent clicks add to the selection.
                actions.append({
                    "base_action": _SELECT_POINT,
                    "sub6": 0 if idx == 0 else 1,
                    "x0": xy[0],
                    "y0": xy[1]
                })
            actions.append({
                "base_action": _SELECT_CONTROL_GROUP,
                "sub4": _CONTROL_GROUP_SET,
                "sub5": group_id
            })
            unit_xy_list = []
            group_list.append(group_id)
            group_id += 1
    # Flush any units left over after the loop into one final group.
    if len(unit_xy_list) >= 1:
        for idx, xy in enumerate(unit_xy_list):
            actions.append({
                "base_action": _SELECT_POINT,
                "sub6": 0 if idx == 0 else 1,
                "x0": xy[0],
                "y0": xy[1]
            })
        actions.append({
            "base_action": _SELECT_CONTROL_GROUP,
            "sub4": _CONTROL_GROUP_SET,
            "sub5": group_id
        })
        group_list.append(group_id)
        group_id += 1
    return actions
def solve_tsp(player_relative, selected, group_list, group_id, dest_per_marine,
              xy_per_marine):
    my_dest = None
    other_dest = None
    closest, min_dist = None, None
    actions = []
    neutral_y, neutral_x = (player_relative == 1).nonzero()
    player_y, player_x = (selected == 1).nonzero()
    if "0" in dest_per_marine and "1" in dest_per_marine:
        if group_id == 0:
            my_dest = dest_per_marine["0"]
            other_dest = dest_per_marine["1"]
        else:
            my_dest = dest_per_marine["1"]
            other_dest = dest_per_marine["0"]
    if len(player_x) > 0:
        if group_id == 0:
            xy_per_marine["1"] = [int(player_x.mean()), int(player_y.mean())]
        else:
            xy_per_marine["0"] = [int(player_x.mean()), int(player_y.mean())]
    player = xy_per_marine[str(group_id)]
    points = [player]
    for p in zip(neutral_x, neutral_y):
        if other_dest:
            dist = np.linalg.norm(np.array(other_dest) - np.array(p))
            if dist < 10:
                # print("continue since partner will take care of it ", p)
                continue
        pp = [p[0], p[1]]
        if pp not in points:
            points.append(pp)
        dist = np.linalg.norm(np.array(player) - np.array(p))
        if not min_dist or dist < min_dist:
            closest, min_dist = p, dist
    solve_tsp = False
    if my_dest:
        dist = np.linalg.norm(np.array(player) - np.array(my_dest))
        if dist < 0.5:
            solve_tsp = True
    if my_dest is None:
        solve_tsp = True
    if len(points) < 2:
        solve_tsp = False
    if solve_tsp:
        # time.clock was removed in Python 3.8; perf_counter is the closest
        # replacement for this debug timer.
        from time import perf_counter as clock
        init = clock()

        def report_sol(obj, s=""):
            # Debug helper for printing the best found solution.
            print("cpu:%g\tobj:%g\ttour:%s" % (clock(), obj, s))

        n, D = mk_matrix(points, distL2)
        niter = 50
        tour, z = multistart_localsearch(niter, n, D)
        left, right = None, None
        # Locate node 0 (the marine) in the tour and take its two tour
        # neighbours as candidate next destinations.
        for idx in range(len(tour)):
            if tour[idx] == 0:
                if idx == len(tour) - 1:
                    right = points[tour[0]]
                    left = points[tour[idx - 1]]
                elif idx == 0:
                    right = points[tour[idx + 1]]
                    left = points[tour[len(tour) - 1]]
                else:
                    right = points[tour[idx + 1]]
                    left = points[tour[idx - 1]]
        left_d = np.linalg.norm(np.array(player) - np.array(left))
        right_d = np.linalg.norm(np.array(player) - np.array(right))
        closest = left if right_d > left_d else right
        # print("optimal next :", closest)
    dest_per_marine[str(group_id)] = closest
    # dest_per_marine example: {'0': [56, 26], '1': [52, 6]}
    if closest:
        if group_id == 0:
            actions.append({"base_action": group_id,
                            "x0": closest[0], "y0": closest[1]})
        else:
            actions.append({"base_action": group_id,
                            "x1": closest[0], "y1": closest[1]})
    elif my_dest:
        if group_id == 0:
            actions.append({"base_action": group_id,
                            "x0": my_dest[0], "y0": my_dest[1]})
        else:
            actions.append({"base_action": group_id,
                            "x1": my_dest[0], "y1": my_dest[1]})
    else:
        if group_id == 0:
            actions.append({"base_action": 2, "x0": 0, "y0": 0})
        else:
            actions.append({"base_action": 2, "x1": 0, "y1": 0})
    # elif len(group_list) > 0:
    #     group_id = random.randint(0, len(group_list) - 1)
    #     actions.append({"base_action": group_id})
    # Alternate between the two marine groups on each call.
    group_id = 1 if group_id == 0 else 0
    if "0" not in xy_per_marine:
        xy_per_marine["0"] = [0, 0]
    if "1" not in xy_per_marine:
        xy_per_marine["1"] = [0, 0]
    return actions, group_id, dest_per_marine, xy_per_marine
def solve_tsp(player_relative, selected, group_list, group_id, dest_per_marine):
    my_dest = None
    other_dest = None
    closest, min_dist = None, None
    actions = []
    neutral_y, neutral_x = (player_relative == 1).nonzero()
    player_y, player_x = (selected == 1).nonzero()
    if "0" in dest_per_marine and "1" in dest_per_marine:
        if group_id == 0:
            my_dest = dest_per_marine["0"]
            other_dest = dest_per_marine["1"]
        else:
            my_dest = dest_per_marine["1"]
            other_dest = dest_per_marine["0"]
    r = random.randint(0, 1)
    if len(player_x) > 0 and r == 0:
        player = [int(player_x.mean()), int(player_y.mean())]
        points = [player]
        for p in zip(neutral_x, neutral_y):
            if other_dest:
                dist = np.linalg.norm(np.array(other_dest) - np.array(p))
                if dist < 10:
                    # print("continue since partner will take care of it ", p)
                    continue
            # Snap shard coordinates to a 2x2 grid to keep the point set small.
            pp = [p[0] // 2 * 2, p[1] // 2 * 2]
            if pp not in points:
                points.append(pp)
            dist = np.linalg.norm(np.array(player) - np.array(p))
            if not min_dist or dist < min_dist:
                closest, min_dist = p, dist
        solve_tsp = False
        if my_dest:
            dist = np.linalg.norm(np.array(player) - np.array(my_dest))
            if dist < 2:
                solve_tsp = True
        if my_dest is None:
            solve_tsp = True
        if len(points) < 2:
            solve_tsp = False
        if solve_tsp:
            # time.clock was removed in Python 3.8; perf_counter is the
            # closest replacement for this debug timer.
            from time import perf_counter as clock
            init = clock()

            def report_sol(obj, s=""):
                # Debug helper for printing the best found solution.
                print("cpu:%g\tobj:%g\ttour:%s" % (clock(), obj, s))

            # print("points: %s" % points)
            n, D = mk_matrix(points, distL2)
            # Multi-start local search.
            niter = 50
            tour, z = multistart_localsearch(niter, n, D)
            # print("best found solution (%d iterations): z = %g" % (niter, z))
            left, right = None, None
            # Locate node 0 (the marine) in the tour and take its two tour
            # neighbours as candidate next destinations.
            for idx in range(len(tour)):
                if tour[idx] == 0:
                    if idx == len(tour) - 1:
                        right = points[tour[0]]
                        left = points[tour[idx - 1]]
                    elif idx == 0:
                        right = points[tour[idx + 1]]
                        left = points[tour[len(tour) - 1]]
                    else:
                        right = points[tour[idx + 1]]
                        left = points[tour[idx - 1]]
            left_d = np.linalg.norm(np.array(player) - np.array(left))
            right_d = np.linalg.norm(np.array(player) - np.array(right))
            closest = left if right_d > left_d else right
            # print("optimal next :", closest)
        dest_per_marine[str(group_id)] = closest
        # dest_per_marine example: {'0': [56, 26], '1': [52, 6]}
        if closest:
            actions.append({"base_action": 2,
                            "x0": closest[0], "y0": closest[1]})
    elif len(group_list) > 0:
        group_id = random.randint(0, len(group_list) - 1)
        actions.append({"base_action": group_id})
    return actions, group_id, dest_per_marine
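# Both solve_tsp variants lean on mk_matrix, distL2, and multistart_localsearch
# from an external TSP module (the names match J.P. Pedroso's classic tsp.py).
# A sketch of the two distance helpers under that assumption:
import math

def distL2(x1, y1, x2, y2):
    """Euclidean distance rounded to the nearest integer (TSPLIB style)."""
    return int(math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) + 0.5)

def mk_matrix(coord, dist):
    """Return n and a symmetric pairwise-distance dict D[(i, j)]."""
    n = len(coord)
    D = {}
    for i in range(n - 1):
        for j in range(i + 1, n):
            D[i, j] = dist(coord[i][0], coord[i][1], coord[j][0], coord[j][1])
            D[j, i] = D[i, j]
    return n, D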
def main(argv):
    import a3c.common.a3c
    scenarios.load_scenarios()
    run_config = run_configs.get()
    interface = sc_pb.InterfaceOptions()
    interface.raw = False
    interface.score = True
    interface.feature_layer.width = 24
    interface.feature_layer.resolution.x = FLAGS.screen_resolution
    interface.feature_layer.resolution.y = FLAGS.screen_resolution
    interface.feature_layer.minimap_resolution.x = 64
    interface.feature_layer.minimap_resolution.y = 64
    queue = FakeQueue()
    # shared.gamma_n = FLAGS.gamma ** FLAGS.n_step_return
    env = helpers.get_env_wrapper(False)
    s_space = env.observation_space.shape
    none_state = np.zeros(s_space)
    none_state = none_state.reshape(s_space)
    replay_agent = Agent(env.action_space.n, t_queue=queue, none_state=none_state)
    for fname in glob.glob(os.path.join(FLAGS.dir, '*.SC2Replay')):
        replay_data = run_config.replay_data(fname)
        start_replay = sc_pb.RequestStartReplay(
            replay_data=replay_data,
            options=interface,
            disable_fog=True,
            observed_player_id=1)
        game_version = get_game_version(replay_data)
        with run_config.start(game_version=game_version,
                              full_screen=False) as controller:
            controller.start_replay(start_replay)
            feat = features.Features(controller.game_info())
            obs = controller.observe()
            s = get_obs(env._input_layers, obs)
            results = 0
            last_reward = 0
            while True:
                # Translate the recorded raw actions back into the agent's
                # reduced action space.
                actions = []
                for a in obs.actions:
                    try:
                        temp = feat.reverse_action(a)
                        x = 0
                        y = 0
                        # Function ids 0 (no_op) and 7 (select_army) take no
                        # screen coordinate.
                        if temp[0] not in [0, 7]:
                            x = temp.arguments[1][0]
                            y = temp.arguments[1][1]
                        actions.append([env._actions.index(temp[0]), x, y])
                    except ValueError:
                        pass
                if len(actions) < 1:
                    try:
                        controller.step(FLAGS.step_mul)
                    except ProtocolError:
                        break
                    obs = controller.observe()
                    s = get_obs(env._input_layers, obs)
                    continue
                r = obs.observation.score.score
                controller.step(FLAGS.step_mul)
                obs = controller.observe()
                s_ = get_obs(env._input_layers, obs)
                if r == 0 and last_reward != 0:
                    s_ = None
                    print('Episode end')
                    results += 1
                if not FLAGS.raw_rewards:
                    replay_agent.train(s, actions[0][0], actions[0][1],
                                       actions[0][2], r, s_)
                else:
                    queue.put([s, actions[0][0], actions[0][1],
                               actions[0][2], r, s_])
                if obs.player_result:
                    break
                if r == 0 and last_reward != 0:
                    last_reward = 0
                else:
                    s = s_
                    last_reward = r
    with gzip.open('./replay_info/info.gz', 'wb+') as outfile:
        print('pushed: {}'.format(results))
        # json.dump(queue.get(), outfile)
        pickle.dump(queue.get(), outfile)