def reward_fn(code_string):
    """Build a RewardInfo whose rewards are the code points of the input.

    Args:
      code_string: String to score. Each character contributes one entry to
          the episode rewards.

    Returns:
      misc.RewardInfo with `episode_rewards` set to the float code point of
      every character (in order), empty lists for the input case, correct
      output and code output, integer input/output types, and reason 'none'.
    """
    per_char_rewards = list(map(float, map(ord, code_string)))
    return misc.RewardInfo(
        episode_rewards=per_char_rewards,
        input_case=[],
        correct_output=[],
        code_output=[],
        input_type=misc.IOType.integer,
        output_type=misc.IOType.integer,
        reason='none',
    )
def _score_string(self, string):
    """Score a code string by handing its tokens to `self.reward_fn`.

    Args:
      string: Code string to score. It is converted to tokens with
          `misc.bf_string_to_tokens` before being scored.

    Returns:
      misc.RewardInfo whose `episode_rewards` has one entry per character of
      `string`: zeros everywhere except the last entry, which holds the reward
      returned by `self.reward_fn`. `code_output` holds the tokens, and
      `reason` is 'correct' or 'wrong' according to the flag returned by
      `self.reward_fn`.
    """
    tokens = misc.bf_string_to_tokens(string)
    score, is_correct = self.reward_fn(tokens)

    # Only the final position carries reward; everything before it is zero.
    rewards = [0.0] * (len(string) - 1)
    rewards.append(score)

    if is_correct:
        verdict = 'correct'
    else:
        verdict = 'wrong'

    return misc.RewardInfo(
        episode_rewards=rewards,
        input_case=None,
        correct_output=None,
        code_output=tokens,
        input_type=None,
        output_type=misc.IOType.integer,
        reason=verdict,
    )
def _score_code(self, code):
    """Execute `code` on every test case of the task and compute its reward.

    Args:
      code: A single BF code string.

    Returns:
      misc.RewardInfo namedtuple instance containing reward and code execution
      information: the per-character rewards (all zero except the last one,
      which holds the terminal reward divided by `self.best_reward`), the test
      inputs, the expected outputs, the outputs the code produced, the input
      and output types, and the reason for the reward. The reason is 'correct'
      when every output matched, 'wrong' when at least one did not, or the
      evaluator's failure reason when an execution did not succeed.
    """
    # Each pair holds an input sequence and the output sequence expected for
    # it.
    io_pairs = self.task.make_io_set()
    total = 0.0
    outputs = []
    reason = 'correct'

    for given_input, expected in io_pairs:
        run = bf.evaluate(
            code,
            input_buffer=given_input,
            timeout=0.1,
            max_steps=self.max_execution_steps,
            base=self.task.base,
            require_correct_syntax=self.require_correct_syntax,
        )
        actual, ran_ok = run.output, run.success

        if not ran_ok:
            # Execution did not succeed: drop everything accumulated so far,
            # use the failure reward and skip the remaining test cases.
            total = self.failure_reward
            outputs = []
            reason = run.failure_reason
            break

        total += self.reward_fn(actual, expected, self.task.base)
        if actual == expected:
            # Bonus for a correct answer. `reason` is left untouched, so it
            # stays 'correct' if no earlier case was wrong.
            total += self.correct_bonus

            # Length is only ever rewarded, never penalised, and only once the
            # output is right, so it cannot interfere with the main objective.
            if self.min_code_length == self.max_code_length:
                total += self.code_length_bonus
            else:
                total += self.code_length_bonus * clipped_linear(
                    x=len(code),
                    x0=self.min_code_length,
                    y0=1.0,
                    slope=-self.time_penalty,
                    y_range=(0.0, 1.0),
                )
        elif reason == 'correct':
            reason = 'wrong'
        outputs.append(actual)

    # One reward per character of the code; only the last one is non-zero.
    total /= self.best_reward
    rewards = [0.0] * (len(code) - 1) + [total]
    return misc.RewardInfo(
        episode_rewards=rewards,
        input_case=misc.IOTuple(inp for inp, _ in io_pairs),
        correct_output=misc.IOTuple(out for _, out in io_pairs),
        code_output=misc.IOTuple(outputs),
        input_type=self.input_type,
        output_type=self.output_type,
        reason=reason,
    )