def train(rank, args, T, shared_model, shared_average_model, optimiser):
  torch.manual_seed(args.seed + rank)

  env = gym.make(args.env)
  env.seed(args.seed + rank)
  action_size = env.action_space.n
  model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
  model.train()

  if not args.on_policy:
    memory = EpisodicReplayMemory(args.memory_capacity, args.max_episode_length)

  t = 1  # Thread step counter
  done = True  # Start new episode

  while T.value() <= args.T_max:
    # On-policy episode loop
    while True:
      # Sync with shared model at least every t_max steps
      model.load_state_dict(shared_model.state_dict())
      # Get starting timestep
      t_start = t

      # Reset or pass on hidden state
      if done:
        hx, avg_hx = Variable(torch.zeros(1, args.hidden_size)), Variable(torch.zeros(1, args.hidden_size))
        cx, avg_cx = Variable(torch.zeros(1, args.hidden_size)), Variable(torch.zeros(1, args.hidden_size))
        # Reset environment and done flag
        state = state_to_tensor(env.reset())
        action, reward, done, episode_length = 0, 0, False, 0
      else:
        # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
        hx = hx.detach()
        cx = cx.detach()

      # Lists of outputs for training
      policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

      while not done and t - t_start < args.t_max:
        # Calculate policy and values
        input = extend_input(state, action_to_one_hot(action, action_size), reward)
        policy, Q, V, (hx, cx) = model(Variable(input), (hx, cx))
        average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(Variable(input), (avg_hx, avg_cx))

        # Sample action
        action = policy.multinomial().data[0, 0]  # Graph broken as loss for stochastic action calculated manually

        # Step
        next_state, reward, done, _ = env.step(action)
        next_state = state_to_tensor(next_state)
        reward = args.reward_clip and min(max(reward, -1), 1) or reward  # Optionally clamp rewards
        done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
        episode_length += 1  # Increase episode counter

        if not args.on_policy:
          # Save (beginning part of) transition for offline training
          memory.append(input, action, reward, policy.data)  # Save just tensors
        # Save outputs for online training
        [arr.append(el) for arr, el in zip((policies, Qs, Vs, actions, rewards, average_policies),
                                           (policy, Q, V, Variable(torch.LongTensor([[action]])),
                                            Variable(torch.Tensor([[reward]])), average_policy))]

        # Increment counters
        t += 1
        T.increment()

        # Update state
        state = next_state

      # Break graph for last values calculated (used for targets, not directly as model outputs)
      if done:
        # Qret = 0 for terminal s
        Qret = Variable(torch.zeros(1, 1))

        if not args.on_policy:
          # Save terminal state for offline training
          memory.append(extend_input(state, action_to_one_hot(action, action_size), reward), None, None, None)
      else:
        # Qret = V(s_i; θ) for non-terminal s
        _, _, Qret, _ = model(Variable(input), (hx, cx))
        Qret = Qret.detach()

      # Train the network on-policy
      _train(args, T, model, shared_model, shared_average_model, optimiser,
             policies, Qs, Vs, actions, rewards, Qret, average_policies)

      # Finish on-policy episode
      if done:
        break

    # Train the network off-policy when enough experience has been collected
    if not args.on_policy and len(memory) >= args.replay_start:
      # Sample a number of off-policy episodes based on the replay ratio
      for _ in range(_poisson(args.replay_ratio)):
        # Act and train off-policy for a batch of (truncated) episodes
        trajectories = memory.sample_batch(args.batch_size, maxlen=args.t_max)

        # Reset hidden state
        hx, avg_hx = Variable(torch.zeros(args.batch_size, args.hidden_size)), Variable(torch.zeros(args.batch_size, args.hidden_size))
        cx, avg_cx = Variable(torch.zeros(args.batch_size, args.hidden_size)), Variable(torch.zeros(args.batch_size, args.hidden_size))

        # Lists of outputs for training
        policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

        # Loop over trajectories (bar last timestep)
        for i in range(len(trajectories) - 1):
          # Unpack first half of transition
          input = torch.cat(tuple(trajectory.state for trajectory in trajectories[i]), 0)
          action = Variable(torch.LongTensor([trajectory.action for trajectory in trajectories[i]])).unsqueeze(1)
          reward = Variable(torch.Tensor([trajectory.reward for trajectory in trajectories[i]])).unsqueeze(1)
          old_policy = Variable(torch.cat(tuple(trajectory.policy for trajectory in trajectories[i]), 0))

          # Calculate policy and values
          policy, Q, V, (hx, cx) = model(Variable(input), (hx, cx))
          average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(Variable(input), (avg_hx, avg_cx))

          # Save outputs for offline training
          [arr.append(el) for arr, el in zip((policies, Qs, Vs, actions, rewards, average_policies, old_policies),
                                             (policy, Q, V, action, reward, average_policy, old_policy))]

          # Unpack second half of transition
          next_input = torch.cat(tuple(trajectory.state for trajectory in trajectories[i + 1]), 0)
          done = Variable(torch.Tensor([trajectory.action is None for trajectory in trajectories[i + 1]]).unsqueeze(1))

        # Do forward pass for all transitions
        _, _, Qret, _ = model(Variable(next_input), (hx, cx))
        # Qret = 0 for terminal s, V(s_i; θ) otherwise
        Qret = ((1 - done) * Qret).detach()

        # Train the network off-policy
        _train(args, T, model, shared_model, shared_average_model, optimiser,
               policies, Qs, Vs, actions, rewards, Qret, average_policies, old_policies=old_policies)

    done = True

  env.close()
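

# The replay loop above draws the number of off-policy updates from
# _poisson(args.replay_ratio), which is defined elsewhere in the repository.
# A minimal sketch of such a helper, assuming a Knuth-style Poisson sampler
# (the repository's actual implementation may differ):
import random
from math import exp


def _poisson(lmbd):
  # Knuth's algorithm: count uniform draws until their running product falls below e^(-lambda)
  L, k, p = exp(-lmbd), 0, 1
  while True:
    k += 1
    p *= random.uniform(0, 1)
    if p <= L:
      return k - 1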
# CUDA-enabled variant of train()
def train(rank, args, T, shared_model, shared_average_model, optimiser):
  torch.manual_seed(args.seed + rank)
  # CUDA
  if args.use_cuda:
    torch.cuda.manual_seed(args.seed + rank)

  env = gym.make(args.env)
  env.seed(args.seed + rank)
  model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
  gpu_id = 0 if args.use_cuda else -1  # TODO: 0 selects the first GPU
  if gpu_id >= 0:
    model = model.cuda()
  model.train()

  if not args.on_policy:
    # Normalise memory capacity by number of training processes
    memory = EpisodicReplayMemory(args.memory_capacity // args.num_processes, args.max_episode_length)

  t = 1  # Thread step counter
  done = True  # Start new episode

  while T.value() <= args.T_max:
    # On-policy episode loop
    while True:
      # Sync with shared model at least every t_max steps
      if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
          model.load_state_dict(shared_model.state_dict())
      else:
        model.load_state_dict(shared_model.state_dict())
      # Get starting timestep
      t_start = t

      # Reset or pass on hidden state
      if done:
        avg_hx = torch.zeros(1, args.hidden_size)
        avg_cx = torch.zeros(1, args.hidden_size)
        if gpu_id >= 0:
          with torch.cuda.device(gpu_id):
            hx = torch.zeros(1, args.hidden_size).cuda()
            cx = torch.zeros(1, args.hidden_size).cuda()
        else:
          hx = torch.zeros(1, args.hidden_size)
          cx = torch.zeros(1, args.hidden_size)
        # Reset environment and done flag
        state = state_to_tensor(env.reset())
        if gpu_id >= 0:
          state = state.cuda()
        done, episode_length = False, 0
      else:
        # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
        hx = hx.detach()
        cx = cx.detach()

      # Lists of outputs for training
      policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

      while not done and t - t_start < args.t_max:
        # Calculate policy and values
        policy, Q, V, (hx, cx) = model(state, (hx, cx))
        # The shared average model stays on the CPU, so move the state across
        if gpu_id >= 0:
          to_avg_state = state.cpu()
        else:
          to_avg_state = state
        average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(to_avg_state, (avg_hx, avg_cx))
        # if gpu_id >= 0:
        #   average_policies = average_policies.cuda()

        # Sample action
        action = torch.multinomial(policy, 1)[0, 0]

        # Step
        next_state, reward, done, _ = env.step(action.item())
        next_state = state_to_tensor(next_state)
        if gpu_id >= 0:
          next_state = next_state.cuda()
        reward = args.reward_clip and min(max(reward, -1), 1) or reward  # Optionally clamp rewards
        done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
        episode_length += 1  # Increase episode counter

        if not args.on_policy:
          # Save (beginning part of) transition for offline training
          memory.append(state, action, reward, policy.detach())  # Save just tensors
        # Save outputs for online training
        [arr.append(el) for arr, el in zip((policies, Qs, Vs, actions, rewards, average_policies),
                                           (policy, Q, V, torch.LongTensor([[action]]),
                                            torch.Tensor([[reward]]), average_policy))]

        # Increment counters
        t += 1
        T.increment()

        # Update state
        state = next_state

      # Break graph for last values calculated (used for targets, not directly as model outputs)
      if done:
        # Qret = 0 for terminal s
        Qret = torch.zeros(1, 1)

        if not args.on_policy:
          # Save terminal state for offline training
          memory.append(state, None, None, None)
      else:
        # Qret = V(s_i; θ) for non-terminal s
        _, _, Qret, _ = model(state, (hx, cx))
        Qret = Qret.detach().cpu()

      # Train the network on-policy
      if gpu_id >= 0:
        Qs = list(map(lambda x: x.cpu(), Qs))
        Vs = list(map(lambda x: x.cpu(), Vs))
        policies = list(map(lambda x: x.cpu(), policies))
      _train(args, T, model, shared_model, shared_average_model, optimiser,
             policies, Qs, Vs, actions, rewards, Qret, average_policies)

      # Finish on-policy episode
      if done:
        break

    # Train the network off-policy when enough experience has been collected
    if not args.on_policy and len(memory) >= args.replay_start:
      # Sample a number of off-policy episodes based on the replay ratio
      for _ in range(_poisson(args.replay_ratio)):
        # Act and train off-policy for a batch of (truncated) episodes
        trajectories = memory.sample_batch(args.batch_size, maxlen=args.t_max)

        # Reset hidden state
        avg_hx = torch.zeros(args.batch_size, args.hidden_size)
        avg_cx = torch.zeros(args.batch_size, args.hidden_size)
        if gpu_id >= 0:
          with torch.cuda.device(gpu_id):
            hx = torch.zeros(args.batch_size, args.hidden_size).cuda()
            cx = torch.zeros(args.batch_size, args.hidden_size).cuda()
        else:
          hx = torch.zeros(args.batch_size, args.hidden_size)
          cx = torch.zeros(args.batch_size, args.hidden_size)

        # Lists of outputs for training
        policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

        # Loop over trajectories (bar last timestep)
        for i in range(len(trajectories) - 1):
          # Unpack first half of transition
          state = torch.cat(tuple(trajectory.state for trajectory in trajectories[i]), 0)
          action = torch.LongTensor([trajectory.action for trajectory in trajectories[i]]).unsqueeze(1)
          reward = torch.Tensor([trajectory.reward for trajectory in trajectories[i]]).unsqueeze(1)
          old_policy = torch.cat(tuple(trajectory.policy for trajectory in trajectories[i]), 0)

          # Calculate policy and values
          policy, Q, V, (hx, cx) = model(state, (hx, cx))
          # As in the on-policy loop, the shared average model stays on the CPU
          to_avg_state = state.cpu() if gpu_id >= 0 else state
          average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(to_avg_state, (avg_hx, avg_cx))

          # Save outputs for offline training
          [arr.append(el) for arr, el in zip((policies, Qs, Vs, actions, rewards, average_policies, old_policies),
                                             (policy, Q, V, action, reward, average_policy, old_policy))]

          # Unpack second half of transition
          next_state = torch.cat(tuple(trajectory.state for trajectory in trajectories[i + 1]), 0)
          done = torch.Tensor([trajectory.action is None for trajectory in trajectories[i + 1]]).unsqueeze(1)

        # Do forward pass for all transitions
        _, _, Qret, _ = model(next_state, (hx, cx))
        # Qret = 0 for terminal s, V(s_i; θ) otherwise
        # done lives on the CPU, so bring Qret across before masking
        Qret = ((1 - done) * Qret.cpu()).detach()

        # Train the network off-policy
        if gpu_id >= 0:
          Qs = list(map(lambda x: x.cpu(), Qs))
          Vs = list(map(lambda x: x.cpu(), Vs))
          policies = list(map(lambda x: x.cpu(), policies))
        _train(args, T, model, shared_model, shared_average_model, optimiser,
               policies, Qs, Vs, actions, rewards, Qret, average_policies, old_policies=old_policies)

    done = True

  env.close()
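

# Both train() variants are written to run as independent worker processes that
# share shared_model, shared_average_model, the optimiser and the global step
# counter T. A minimal launch sketch, assuming a Hogwild-style setup with
# torch.multiprocessing and a hypothetical Counter class exposing the
# value()/increment() interface used above; the names here are illustrative and
# do not reflect the repository's actual entry point.
import torch.multiprocessing as mp


def launch_workers(args, shared_model, shared_average_model, optimiser, Counter):
  # Parameters must live in shared memory before the workers are forked
  shared_model.share_memory()
  shared_average_model.share_memory()

  T = Counter()  # shared step counter with value()/increment()
  processes = []
  for rank in range(args.num_processes):
    p = mp.Process(target=train,
                   args=(rank, args, T, shared_model, shared_average_model, optimiser))
    p.start()
    processes.append(p)
  for p in processes:
    p.join()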
class Parser:
  def __init__(self):
    self.questions = []
    self.imageNames = []
    self.option1s = []
    self.option2s = []
    self.body_coords = []
    self.prefixes = []
    self.layer = 1
    self.correctAnswer = 0
    self.rewardsWithoutE = []
    self.states = []
    self.hm = {}
    self.memory = []
    self.output = []
    self.epsilon = 0
    self.minLen = 5
    self.maxLen = 30
    self.numNode = 22
    # action array format: [nodeId, bubbleSize, bubbleScale]
    self.bubbleSize = 3
    self.bubbleScale = 3

  '''
  input: initial csv files array
  output: none
  side effect: initialise parser parameters with the initial csv files
  '''
  def parseInit(self, args):
    for arg in args:
      state = []
      i = 0
      for line in open(arg):
        if i > 0:
          curRow = line.rstrip().split(',')
          if not isFloat(curRow[2]):  # last row holds other info
            state.append(float(curRow[0]))
            self.imageNames.append(curRow[1])
            self.questions.append(curRow[2])
            self.option1s.append(curRow[3])
            self.option2s.append(curRow[4])
            imageId = 0
            if isFloat(curRow[1][7]):
              imageId = curRow[1][6:8]
            else:
              imageId = curRow[1][6]
            self.body_coords.append('task{0}_img{1}_all_layer_coords.csv'.format(curRow[1][4], imageId))
            self.prefixes.append('task{0}_{1}_{2}_0-4_'.format(curRow[1][4], imageId, self.layer))
          else:
            self.hm[i] = curRow[1]
            curRow = curRow[0:1] + curRow[2:] + [0]
            curRow = list(map(float, curRow))
            state = state + curRow
        i += 1
      self.states.append(state)

  '''
  input: number of samples to generate
  output: none
  side effect: generate output array, prepared to write into output files
  '''
  def generateRandomDataset(self, num):
    hidden_size = 32
    model = torch.load('training_cps/training1_2_layer2_1-0_270000.pt')
    # model = ActorCritic(STATE_SPACE, ACTION_SPACE, hidden_size, NUM_LAYERS)
    # pre_policy = np.zeros((1, 198))
    # pre_policy = torch.FloatTensor(pre_policy)
    for index in range(len(self.states)):
      state = self.states[index]
      pre_state = torch.FloatTensor(np.zeros((1, 111)))
      pre_actionVal = 0
      # file = open('policy.txt', 'w')
      for i in range(1, num + 1):
        count = 0
        hx = Variable(torch.zeros(1, hidden_size))
        cx = Variable(torch.zeros(1, hidden_size))
        # state = self.state
        rand = random.randint(self.minLen, self.maxLen)
        reward = random.uniform(0, 10)
        tensorState = torch.zeros(1, STATE_SPACE)
        minS = float("inf")
        for timestep in range(1, rand):  # each episode
          tensorState = torch.FloatTensor(np.array(state)).view(1, STATE_SPACE)  # new state
          actionSingleVal = random.randint(0, 197)
          policy, _, _, (hx, cx) = model(Variable(tensorState), (hx, cx))
          # pre_policy = policy
          prob = random.uniform(0, 1)
          if prob > self.epsilon:  # with probability 1 - epsilon take the greedy action
            # file.write(str(tensorState - pre_state) + '\n')
            actionSingleVal = policy.max(1)[1].data[0]
            if actionSingleVal == pre_actionVal:
              count += 1
              if count > 2:
                randVal = random.randint(1, 22)
                actionSingleVal = (actionSingleVal + 7 * randVal) % 198
            else:
              count = 0
          pre_actionVal = actionSingleVal
          action = Parser._convertSingleToArray(actionSingleVal)
          question = action[0]
          # print(question)
          self.output.append([
              self.prefixes[index] + 'trial_' + str(i) + '_' + str(timestep),
              'Show me ' + self.hm[question],
              str(action[0]), str(action[1]), str(action[2]),
              self.imageNames[index], self.questions[index],
              self.option1s[index], self.option2s[index],
              self.body_coords[index]
          ])
          # pre_policy = policy.data
          pre_state = tensorState
          # print(action[0])
          state[(action[0] - 1) * 5 + 4] = 1
        self.output.append([])

  '''
  input: output file name
  output: none
  side effect: write output array constructed in generateRandomDataset to the specified output file
  '''
  def writeToFile(self, arg):
    file = open(arg, 'w')
    file.write('timestep,question ,action_node_id, bubble_size, bubble_scale, '
               'image_name, question, option1, option2,body_coords\n')
    for item in self.output:
      if len(item) > 0:  # current episode not finished
        string = ''
        for feature in item:
          string += feature + ','
        file.write(string[:-1])
      else:
        string = ''
        for i in range(10):  # fill the episode-separator line with -1 placeholders
          string += str(-1) + ','
        file.write(string[:-1])
        # for 4 more columns to indicate 4 types of additional info
      file.write('\n')

  '''
  input: output file name array
  output: storage array of per-episode [tensorState, action, policy] entries, each episode ending with [tensorState, final reward]
  side effect: reset the episodic replay memory used for RL + RNN training
  '''
  def _getStorage(self, several_outputs):
    scalaHm = {0: 1, 1: 9, 2: 15}
    sizeHm = {0: 1.45, 1: 2.15, 2: 3.15}
    hidden_size = 32
    model = ActorCritic(STATE_SPACE, ACTION_SPACE, hidden_size, NUM_LAYERS)
    memory_capacity = 10000
    max_episode_length = 10
    self.memory = EpisodicReplayMemory(memory_capacity, max_episode_length)
    states = self.states
    # storage has the format [[[], [], []], [episode2], ...]; the last
    # element of each episode is [tensorState, final reward]
    storage = []
    storage.append([])
    episodeI = 0
    for index in range(len(states)):
      state = states[index]
      hx = Variable(torch.zeros(1, hidden_size))
      cx = Variable(torch.zeros(1, hidden_size))
      tensorState = torch.zeros(1, STATE_SPACE)
      minS = float("inf")
      with open(several_outputs[index]) as f:
        next(f)
        for line in f:
          curRow = line.rstrip().split(',')
          if curRow[0] == '-1' or curRow[0] == '50':  # end of an episode
            sigma = minS / len(storage[episodeI])
            entropy = 1 + 0.5 * math.log(39.4 * sigma)
            # plugging in the same reward for all turns in the episode... the ACER code uses the lambda return to scale them
            storage[episodeI].append([tensorState, self.rewardsWithoutE[episodeI] + entropy])
            episodeI += 1
            # print(episodeI)
            if episodeI >= 1583:
              break
            hx = Variable(torch.zeros(1, hidden_size))
            cx = Variable(torch.zeros(1, hidden_size))
            state = states[index]  # prepare for next episode
            storage.append([])
            minS = float("inf")
          else:
            tensorState = torch.FloatTensor(np.array(state)).view(1, STATE_SPACE)
            [action_node_id, bubble_size, bubble_scale] = list(map(int, curRow[2:5]))
            actionSingleVal = (action_node_id - 1) * self.bubbleScale * self.bubbleSize + \
                bubble_size * self.bubbleScale + bubble_scale
            minS = min(minS, pow(scalaHm[bubble_scale], 2) * pow(sizeHm[bubble_size], 2))
            policy, _, _, (hx, cx) = model(Variable(tensorState), (hx, cx))
            storage[episodeI].append([tensorState, actionSingleVal, policy.data])
            state[(action_node_id - 1) * 5 + 4] = 1
    return storage

  '''
  input: output file name array
  output: none
  side effect: write storage array into the memory required for RL + RNN training
  '''
  def writeBackMemory(self, several_outputs):
    storage = self._getStorage(several_outputs)
    # pdb.set_trace()
    for episode in storage:
      last = episode[-1]  # last element holds the final reward
      reward = last[1]
      for i in range(len(episode) - 1):
        each = episode[i]
        tensorState = each[0]
        actionSingleVal = each[1]
        policyData = each[2]
        self.memory.append(tensorState, actionSingleVal, reward, policyData)
      tensorState = last[0]
      self.memory.append(tensorState, None, None, None)
    # pdb.set_trace()

  '''
  input: output file name array, reward file name array
  output: none
  side effect: append bubble lengths to the end of each row of the corresponding reward files
  '''
  def appendToAMTRewardsAndBubbleLen(self, several_outputs, several_rewards):
    storage = self._getStorage(several_outputs)
    # append to file
    episodeI = 0
    for arg in several_rewards:
      inputFile = open('AMT_rewards/' + arg, 'r')
      outputFile = open('appended_AMT_rewards/' + arg[:-4] + '_appended.csv', 'w')
      firstLine = inputFile.readline()
      # firstLine.rstrip('\n') + ', "bubbleLen", "reward"\n'
      outputFile.write(firstLine)
      for line in inputFile:
        episode = storage[episodeI]
        episodeLen = len(episode) - 1
        last = episode[-1]  # last element holds the final reward
        reward = last[1]
        outputFile.write('{0},{1},{2}\n'.format(line.rstrip('\n'), str(episodeLen), str(reward)))
        episodeI += 1

  '''
  input: structure file name, output file name array
  output: none
  side effect: append discourse and structure information to the end of each row of the output files
  '''
  def appendDiscourseAndStructure(self, structureFile, several_outputs):
    # storage = self._getStorage(several_outputs)
    # append to file
    parentHm, childrenHm = Parser._extractStructure(structureFile)
    for arg in several_outputs:
      inputFile = open(arg, 'r')
      outputFile = open('appended_outputs/' + arg[:-4] + '_appended.csv', 'w')
      firstLine = inputFile.readline()
      outputFile.write(firstLine.rstrip() + ', structure, discourse\n')
      pre_acton_node_id, pre_bubble_size, pre_bubble_scale = None, None, None
      index = 0
      for line in inputFile:
        appended = ''
        curRow = line.rstrip().split(',')
        if curRow[0] == '-1' or curRow[0] == '50':  # end of an episode
          appended = ', NA, NA'
        else:
          [action_node_id, bubble_size, bubble_scale] = list(map(int, curRow[2:5]))
          if pre_acton_node_id is None:
            appended = ', NA, NA'
          else:
            if parentHm.get(pre_acton_node_id) == action_node_id:
              appended += ', bottomUp, '
            elif (childrenHm.get(pre_acton_node_id) is not None
                  and action_node_id in childrenHm[pre_acton_node_id]):
              appended += ', topdown, '
            elif action_node_id == pre_acton_node_id:
              appended += ', alpha, '
            else:
              appended += ', NA, '
            if action_node_id == pre_acton_node_id:
              if pre_bubble_scale == bubble_scale and pre_bubble_size == bubble_size:
                appended += 'recurrence'
              elif bubble_scale > pre_bubble_scale:
                appended += 'elaboration'
              elif bubble_scale < pre_bubble_scale and bubble_size > pre_bubble_size:
                appended += 'summary'
              else:
                appended += 'restatement'
            else:
              appended += 'sequence'
          pre_acton_node_id, pre_bubble_size, pre_bubble_scale = action_node_id, bubble_size, bubble_scale
        outputFile.write(line.rstrip('\n') + appended + '\n')

  '''
  input: reward file name array
  output: none
  side effect: extract rewards (without entropy) into the parser's internal array
  '''
  def readAMTBatch(self, args):
    i = 0
    for arg in args:
      with open('AMT_rewards/' + arg) as f:
        next(f)
        for line in f:
          i += 1
          curRow = line.rstrip().split(',')
          Qvalues = curRow[-3:]
          [Q1, Q2, Q3] = list(map(Parser._extractNumber, Qvalues))
          # Q1 should always be the correct answer
          rewardWithoutE = (3 - 2 * Q1) * Q2 * Q3 + 3.6 * (2 - 2 * Q1)
          self.rewardsWithoutE.append(rewardWithoutE)
          # pdb.set_trace()

  '''
  input: reward file
  output: indexes of the extracted rows
  side effect: extract the top-20 reward actions and write them to an extracted csv file
  '''
  def extractFirst20(self, rewardFile):
    inputFile = open('appended_AMT_rewards/' + rewardFile, 'r')
    outputFile = open(rewardFile[0:-4] + '_extracted.csv', 'w')
    firstLine = inputFile.readline()
    outputFile.write(firstLine)
    lines = []
    index = 0
    for line in inputFile:
      curRow = line.rstrip().split(',')
      lines.append([float(curRow[-1]), index, line])
      index += 1
    lines = sorted(lines, key=lambda x: -x[0])
    retindexes = []
    for i in range(20):
      outputFile.write(lines[i][-1])
      retindexes.append(lines[i][1])
    return retindexes

  '''
  input: number of (state, action) pairs to generate for each fixed length
  output: none
  side effect: generate output rows for episodes of each length in fixedLens, appended to the output array
  '''
  def generateDifferentLenBatches(self, num, fixedLens):
    hidden_size = 32
    # training_cps/supervised_1_1_layer1_0-0_100000.pt
    model = torch.load('training_cps/training1_2_layer1_1-0_980000.pt')
    # model = ActorCritic(STATE_SPACE, ACTION_SPACE, hidden_size, NUM_LAYERS)
    # pre_policy = np.zeros((1, 198))
    # pre_policy = torch.FloatTensor(pre_policy)
    state = self.states[0]
    pre_state = torch.FloatTensor(np.zeros((1, 111)))
    pre_actionVal = 0
    # file = open('policy.txt', 'w')
    index = 0
    for fixedLen in fixedLens:
      for i in range(1, num + 1):
        count = 0
        hx = Variable(torch.zeros(1, hidden_size))
        cx = Variable(torch.zeros(1, hidden_size))
        reward = random.uniform(0, 10)
        tensorState = torch.zeros(1, STATE_SPACE)
        minS = float("inf")
        for timestep in range(1, fixedLen + 1):  # each episode
          tensorState = torch.FloatTensor(np.array(state)).view(1, STATE_SPACE)  # new state
          actionSingleVal = random.randint(0, 197)
          policy, _, _, (hx, cx) = model(Variable(tensorState), (hx, cx))
          # pre_policy = policy
          prob = random.uniform(0, 1)
          if prob > self.epsilon:  # with probability 1 - epsilon take the greedy action
            # file.write(str(tensorState - pre_state) + '\n')
            actionSingleVal = policy.max(1)[1].data[0]
            if actionSingleVal == pre_actionVal:
              count += 1
              if count > 2:
                randVal = random.randint(1, 22)
                actionSingleVal = (actionSingleVal + 7 * randVal) % 198
            else:
              count = 0
          pre_actionVal = actionSingleVal
          # print(actionSingleVal)
          action = Parser._convertSingleToArray(actionSingleVal.numpy())
          question = action[0]
          # print(question)
          self.output.append([
              self.prefixes[index] + str(fixedLen) + '_trial_' + str(i) + '_' + str(timestep),
              'Show me ' + self.hm[question],
              str(action[0]), str(action[1]), str(action[2]),
              self.imageNames[index], self.questions[index],
              self.option1s[index], self.option2s[index],
              self.body_coords[index]
          ])
          # pre_policy = policy.data
          pre_state = tensorState
          # print(action[0])
          state[(action[0] - 1) * 5 + 4] = 1
        self.output.append([])

  '''
  input: output file name array
  output: none
  side effect: write the top-20 episodes from the storage structure back to the memory required for RNN + RL training
  '''
  def writeBackMemoryExtracted20(self, several_outputs):
    storage = self._getStorage(several_outputs)
    # pdb.set_trace()
    retindexes = self.extractFirst20('AMT1_1_layer2_1-0_appended.csv')
    for index in retindexes:
      episode = storage[index]
      last = episode[-1]  # last element holds the final reward
      reward = last[1]
      for i in range(len(episode) - 1):
        each = episode[i]
        tensorState = each[0]
        actionSingleVal = each[1]
        policyData = each[2]
        self.memory.append(tensorState, actionSingleVal, reward, policyData)
      tensorState = last[0]
      self.memory.append(tensorState, None, None, None)
    # pdb.set_trace()

  '''
  input: node hierarchy file
  output: the parent and children maps extracted from the structural relationships in the input file
  '''
  @staticmethod
  def _extractStructure(structureFile):
    file = open(structureFile, 'r')
    childrenHm, parentHm = dict(), dict()
    next(file)
    for line in file:
      curRow = line.rstrip().replace('"', '').split(',')
      node_id = int(curRow[0])
      parent = 'NA' if curRow[2] == 'NA' else int(curRow[2])
      children = 'NA' if curRow[3] == 'NA' else list(map(int, curRow[3:]))
      if parent != 'NA':
        parentHm[node_id] = parent
      if children != 'NA':
        childrenHm[node_id] = children
    return parentHm, childrenHm

  '''
  input: single action value
  output: the action value array [nodeId, bubbleSize, bubbleScale]
  '''
  @staticmethod
  def _convertSingleToArray(actionSingleVal):
    action = []
    question = actionSingleVal // 9 + 1
    action.append(question)
    actionSingleVal = actionSingleVal % 9
    action.append(actionSingleVal // 3)
    action.append(actionSingleVal % 3)
    return action

  @staticmethod
  def _extractNumber(val):
    return int(list(filter(str.isdigit, val))[0])
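

# The single integer action used throughout Parser packs [nodeId, bubbleSize,
# bubbleScale] with 3 sizes x 3 scales per node (22 nodes x 9 = 198 actions,
# hence random.randint(0, 197) above). _getStorage encodes an action as
# (node_id - 1) * 9 + size * 3 + scale and _convertSingleToArray decodes it.
# A small round-trip check of that encoding, using the class defined above;
# the helper names below are illustrative only:
def _encode_action(node_id, bubble_size, bubble_scale):
  # Inverse of Parser._convertSingleToArray, matching the formula in _getStorage
  return (node_id - 1) * 9 + bubble_size * 3 + bubble_scale


def _check_action_encoding():
  for node_id in range(1, 23):
    for size in range(3):
      for scale in range(3):
        val = _encode_action(node_id, size, scale)
        assert 0 <= val <= 197
        assert Parser._convertSingleToArray(val) == [node_id, size, scale]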