def _create_rl_train_net(self) -> None:
    self.rl_train_model = ModelHelper(name="rl_train_" + self.model_id)
    C2.set_model(self.rl_train_model)

    if self.reward_shape is not None:
        # Boost the reward of each shaped action: slicing the one-hot
        # `actions` column gives a 0/1 mask, so the boost is only added to
        # rows where that action was taken.
        for action_index, boost in self.reward_shape.items():
            action_boost = C2.Mul(
                C2.Slice(
                    "actions",
                    starts=[0, action_index],
                    ends=[-1, action_index + 1],
                ),
                boost,
                broadcast=1,
            )
            C2.net().Sum(["rewards", action_boost], ["rewards"])

    if self.maxq_learning:
        # Q-learning: bootstrap from the best possible next action.
        next_q_values = self.get_max_q_values(
            "next_states", self.get_possible_next_actions(), True
        )
    else:
        # SARSA: bootstrap from the logged next action.
        next_q_values = self.get_q_values("next_states", "next_actions", True)

    # Discount as gamma ** time_diff so transitions that span several time
    # steps are discounted accordingly.
    discount_blob = C2.ConstantFill("time_diff", value=self.rl_discount_rate)
    time_diff_adjusted_discount_blob = C2.Pow(
        discount_blob, C2.Cast("time_diff", to=caffe2_pb2.TensorProto.FLOAT)
    )

    # TD target: rewards + not_terminal * gamma^time_diff * Q(next).
    q_vals_target = C2.Add(
        "rewards",
        C2.Mul(
            C2.Mul(
                C2.Cast(
                    "not_terminals", to=caffe2_pb2.TensorProto.FLOAT
                ),  # type: ignore
                time_diff_adjusted_discount_blob,
                broadcast=1,
            ),
            next_q_values,
        ),
    )

    self.update_model("states", "actions", q_vals_target)
    workspace.RunNetOnce(self.rl_train_model.param_init_net)
    workspace.CreateNet(self.rl_train_model.net)
    C2.set_model(None)

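# A minimal numpy sketch (illustrative only, not part of the trainer) of the
# TD target assembled above. It assumes `q_next` already holds
# max_a' Q(s', a') (Q-learning) or Q(s', next_action) (SARSA); the function
# and argument names here are hypothetical.
def _td_target_sketch(rewards, not_terminals, time_diff, q_next, gamma):
    import numpy as np

    # gamma ** time_diff generalizes the one-step discount; terminal rows
    # contribute no bootstrap term because not_terminals is 0 there.
    discount = np.power(gamma, time_diff.astype(np.float32))
    return rewards + not_terminals.astype(np.float32) * discount * q_next
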
def _create_reward_train_net(self) -> None:
    self.reward_train_model = ModelHelper(name="reward_train_" + self.model_id)
    C2.set_model(self.reward_train_model)
    if self.reward_shape is not None:
        for action_index, boost in self.reward_shape.items():
            action_boost = C2.Mul(
                C2.Slice(
                    "actions",
                    starts=[0, action_index],
                    ends=[-1, action_index + 1],
                ),
                boost,
                broadcast=1,
            )
            C2.net().Sum(["rewards", action_boost], ["rewards"])
    self.update_model("states", "actions", "rewards")
    workspace.RunNetOnce(self.reward_train_model.param_init_net)
    workspace.CreateNet(self.reward_train_model.net)
    C2.set_model(None)

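# Plain-numpy sketch of the reward-shaping loop shared by the nets above.
# Assumptions for illustration: `actions` is a one-hot matrix and `rewards`
# an (n, 1) column; the helper name is hypothetical.
def _reward_shaping_sketch(rewards, actions, reward_shape):
    for action_index, boost in reward_shape.items():
        # The sliced column is 0/1, so the boost lands only on rows that
        # took this action.
        rewards = rewards + actions[:, action_index : action_index + 1] * boost
    return rewards
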
def _create_rl_train_net(self) -> None:
    self.rl_train_model = ModelHelper(name="rl_train_" + self.model_id)
    C2.set_model(self.rl_train_model)

    if self.reward_shape is not None:
        for action_index, boost in self.reward_shape.items():
            action_boost = C2.Mul(
                C2.Slice(
                    'actions',
                    starts=[0, action_index],
                    ends=[-1, action_index + 1],
                ),
                boost,
                broadcast=1,
            )
            C2.net().Sum(['rewards', action_boost], ['rewards'])

    if self.maxq_learning:
        next_q_values = self.get_max_q_values(
            'next_states',
            self.get_possible_next_actions(),
            True,
        )
    else:
        next_q_values = self.get_q_values('next_states', 'next_actions', True)

    q_vals_target = C2.Add(
        'rewards',
        C2.Mul(
            C2.Mul(
                C2.Cast(
                    'not_terminals', to=caffe2_pb2.TensorProto.FLOAT
                ),  # type: ignore
                self.rl_discount_rate,
                broadcast=1,
            ),
            next_q_values,
        ),
    )

    self.update_model('states', 'actions', q_vals_target)
    workspace.RunNetOnce(self.rl_train_model.param_init_net)
    workspace.CreateNet(self.rl_train_model.net)
    C2.set_model(None)

def _create_reward_train_net(self) -> None:
    self.reward_train_model = ModelHelper(name="reward_train_" + self.model_id)
    C2.set_model(self.reward_train_model)
    if self.reward_shape is not None:
        for action_index, boost in self.reward_shape.items():
            action_boost = C2.Mul(
                C2.Slice(
                    "actions",
                    starts=[0, action_index],
                    ends=[-1, action_index + 1],
                ),
                boost,
                broadcast=1,
            )
            C2.net().Sum(["rewards", action_boost], ["rewards"])
    self.update_model("states", "actions", "rewards")
    workspace.RunNetOnce(self.reward_train_model.param_init_net)
    # Run the train net asynchronously across multiple workers.
    self.reward_train_model.net.Proto().num_workers = (
        RLTrainer.DEFAULT_TRAINING_NUM_WORKERS
    )
    self.reward_train_model.net.Proto().type = "async_scheduling"
    workspace.CreateNet(self.reward_train_model.net)
    C2.set_model(None)

@classmethod
def _forward_pass(cls, model, trainer, normalized_dense_matrix, actions):
    C2.set_model(model)

    parameters = []
    q_values = "q_values"
    workspace.FeedBlob(q_values, np.zeros(1, dtype=np.float32))
    trainer.build_predictor(model, normalized_dense_matrix, q_values)
    parameters.extend(model.GetAllParams())

    action_names = C2.NextBlob("action_names")
    parameters.append(action_names)
    workspace.FeedBlob(action_names, np.array(actions))
    action_range = C2.NextBlob("action_range")
    parameters.append(action_range)
    workspace.FeedBlob(action_range, np.array(list(range(len(actions)))))

    output_shape = C2.Shape(q_values)
    output_shape_row_count = C2.Slice(output_shape, starts=[0], ends=[1])
    output_row_shape = C2.Slice(q_values, starts=[0, 0], ends=[-1, 1])

    # Expose the Q-values in the string_weighted_multi_categorical_features
    # output format: one feature per row whose values map action names to
    # their Q-values.
    output_feature_keys = 'output/string_weighted_multi_categorical_features.keys'
    workspace.FeedBlob(output_feature_keys, np.zeros(1, dtype=np.int64))
    output_feature_keys_matrix = C2.ConstantFill(
        output_row_shape, value=0, dtype=caffe2_pb2.TensorProto.INT64
    )
    # Note: sometimes we need to use an explicit output name, so we call
    # C2.net().Fn(...)
    C2.net().FlattenToVec([output_feature_keys_matrix], [output_feature_keys])

    output_feature_lengths = (
        'output/string_weighted_multi_categorical_features.lengths'
    )
    workspace.FeedBlob(output_feature_lengths, np.zeros(1, dtype=np.int32))
    output_feature_lengths_matrix = C2.ConstantFill(
        output_row_shape, value=1, dtype=caffe2_pb2.TensorProto.INT32
    )
    C2.net().FlattenToVec(
        [output_feature_lengths_matrix], [output_feature_lengths]
    )

    output_keys = 'output/string_weighted_multi_categorical_features.values.keys'
    workspace.FeedBlob(output_keys, np.array(['a']))
    C2.net().Tile([action_names, output_shape_row_count], [output_keys], axis=1)

    output_lengths_matrix = C2.ConstantFill(
        output_row_shape, value=len(actions), dtype=caffe2_pb2.TensorProto.INT32
    )
    output_lengths = (
        'output/string_weighted_multi_categorical_features.values.lengths'
    )
    workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
    C2.net().FlattenToVec([output_lengths_matrix], [output_lengths])

    output_values = (
        'output/string_weighted_multi_categorical_features.values.values'
    )
    workspace.FeedBlob(output_values, np.array([1.0]))
    C2.net().FlattenToVec([q_values], [output_values])
    return parameters, q_values
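
# Illustrative sketch (hypothetical names; an assumption, not the predictor
# API) of the dense layout the output blobs above encode, for a batch of two
# rows and two actions.
def _output_layout_sketch():
    import numpy as np

    actions = ["left", "right"]
    q_values = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
    n, k = q_values.shape
    return {
        # One int64 feature key (0) and one feature length (1) per row.
        "output/string_weighted_multi_categorical_features.keys": np.zeros(
            n, dtype=np.int64
        ),
        "output/string_weighted_multi_categorical_features.lengths": np.ones(
            n, dtype=np.int32
        ),
        # Per row: the k action names as keys, a length of k, and the row's
        # Q-values flattened in order.
        "output/string_weighted_multi_categorical_features.values.keys": np.tile(
            np.array(actions), n
        ),
        "output/string_weighted_multi_categorical_features.values.lengths": np.full(
            n, k, dtype=np.int32
        ),
        "output/string_weighted_multi_categorical_features.values.values": (
            q_values.flatten()
        ),
    }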