def __init__(self, nParams, nActions, approximationFunctionArgs, **kwargs):
    self.nParams = nParams
    self.nActions = nActions
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.actionSelectionMethod = getValueFromDict(kwargs, "actionSelectionMethod", "greedy")
    self.epsilon = getValueFromDict(kwargs, "epsilon", 0.0)
    self.tieBreakingMethod = getValueFromDict(kwargs, "tieBreakingMethod", "consistent")
    self.w = np.zeros([self.nParams], dtype=float)
    if self.tieBreakingMethod == "arbitrary":
        # Break ties randomly among all maximal action values.
        self.argmax_function = lambda q: np.random.choice(np.flatnonzero(q == np.max(q)))
    elif self.tieBreakingMethod == "consistent":
        # np.argmax always returns the first maximal entry.
        self.argmax_function = np.argmax
    else:
        sys.exit("ERROR: FunctionApproximationPolicy: tieBreakingMethod not recognized!")
    if self.actionSelectionMethod == "egreedy":
        self.actionSelection_function = selectAction_egreedy
    elif self.actionSelectionMethod == "softmax":
        self.actionSelection_function = selectAction_softmax
    elif self.actionSelectionMethod == "greedy":
        self.actionSelection_function = selectAction_greedy
    elif self.actionSelectionMethod == "esoft":
        self.actionSelection_function = selectAction_esoft
    else:
        sys.exit("ERROR: FunctionApproximationPolicy: actionSelectionMethod not recognized!")
def __init__(self, nParams, nActions, alpha, gamma, lambd, approximationFunctionArgs,
             actionSelectionMethod="egreedy", epsilon=0.01):
    self.name = "True Online SARSA"
    self.nParams = nParams
    self.nActions = nActions
    self.alpha = alpha
    self.gamma = gamma
    self.lambd = lambd
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.ftf = getValueFromDict(self.af_kwargs, "ftf")
    self.w = np.zeros([self.nParams], dtype=float)  # weight vector
    self.z = np.zeros([self.nParams], dtype=float)  # dutch-style eligibility trace
    self.q_old = 0.0
    self.policy = FunctionApproximationPolicy(
        self.nParams, self.nActions, self.af_kwargs,
        actionSelectionMethod=actionSelectionMethod, epsilon=epsilon)
def softmaxLinear(w, state, action=None, **kwargs):
    ftf = getValueFromDict(kwargs, "ftf")
    nActions = getValueFromDict(kwargs, "nActions")
    p = normalize_softmax(np.array(
        [np.dot(w.T, ftf(state, a, **kwargs)) for a in range(nActions)]))
    return p if action is None else p[action]
def __init__(self, nParams, nActions, alpha, gamma, lambd, approximationFunctionArgs,
             doAccumulateTraces=False, doClearTraces=False,
             actionSelectionMethod="egreedy", epsilon=0.01):
    self.name = "SARSA(Lambda)"
    self.nParams = nParams
    self.nActions = nActions
    self.alpha = alpha
    self.gamma = gamma
    self.lambd = lambd
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.ftf = getValueFromDict(self.af_kwargs, "ftf")
    self.doAccumulateTraces = doAccumulateTraces
    self.doClearTraces = doClearTraces
    self.w = np.zeros([self.nParams], dtype=float)  # weight vector
    self.z = np.zeros([self.nParams], dtype=float)  # eligibility trace
    self.policy = FunctionApproximationPolicy(
        self.nParams, self.nActions, self.af_kwargs,
        actionSelectionMethod=actionSelectionMethod, epsilon=epsilon)
def dLogSoftmaxLinear(w, state, action=None, **kwargs):
    ftf = getValueFromDict(kwargs, "ftf")
    nActions = getValueFromDict(kwargs, "nActions")
    features = np.array([ftf(state, a, **kwargs) for a in range(nActions)], dtype=float)
    p = softmaxLinear(w, state, **kwargs)
    # For a linear softmax policy: grad log pi(a|s) = x(s,a) - sum_b pi(b|s) x(s,b).
    expectation = np.dot(p, features)
    return features[action] - expectation
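# Usage sketch for the two functions above. onehot_ftf is a made-up feature
# function for illustration (not part of the module); the check verifies
# numerically that the expected score is zero: sum_a pi(a|s) grad log pi(a|s) = 0.
def onehot_ftf(state, action, nFeatures=4, **kwargs):
    # One one-hot block of nFeatures entries per action.
    x = np.zeros(2 * nFeatures)
    x[action * nFeatures + state % nFeatures] = 1.0
    return x

w = np.random.randn(8)
kw = {"ftf": onehot_ftf, "nActions": 2}
p = softmaxLinear(w, 1, **kw)  # action probabilities in state 1
grads = np.array([dLogSoftmaxLinear(w, 1, a, **kw) for a in range(2)])
assert np.allclose(p @ grads, np.zeros(8))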
def __init__(self, nParams, alpha, gamma, approximationFunctionArgs):
    self.name = "Semi-gradient TD Prediction"
    self.nParams = nParams
    self.alpha = alpha
    self.gamma = gamma
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.w = np.zeros([self.nParams], dtype=float)
def __init__(self, nParams, gamma, epsilon, approximationFunctionArgs):
    self.name = "LSTD"
    self.nParams = nParams
    self.gamma = gamma
    self.epsilon = epsilon
    self.af_kwargs = approximationFunctionArgs
    self.ftf = getValueFromDict(self.af_kwargs, "ftf")
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.invA = np.eye(self.nParams) * self.epsilon
    self.b = np.zeros([self.nParams, 1])
    self.w = np.zeros([self.nParams], dtype=float)
def __init__(self, nParams, alpha, gamma, lambd, approximationFunctionArgs):
    self.name = "Semi-Gradient TD(Lambda)"
    self.nParams = nParams
    self.alpha = alpha
    self.gamma = gamma
    self.lambd = lambd
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.w = np.zeros([self.nParams], dtype=float)  # weight vector
    self.z = np.zeros([self.nParams], dtype=float)  # eligibility trace
def __init__(self, nParams, alpha, gamma, lambd, approximationFunctionArgs):
    self.name = "Online Lambda Return"
    self.nParams = nParams
    self.alpha = alpha
    self.gamma = gamma
    self.lambd = lambd
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.w = np.zeros([self.nParams], dtype=float)
    self.bufferExperience = []
def __init__(self, nStates, nParams, alpha, gamma, approximationFunctionArgs):
    self.name = "Gradient Monte Carlo Prediction"
    self.nStates = nStates
    self.nParams = nParams
    self.alpha = alpha
    self.gamma = gamma
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.returns = {}
    self.visitCounts = np.zeros([self.nStates], dtype=int)
    self.w = np.zeros([self.nParams], dtype=float)
def fourier(state, action=None, **kwargs):
    nParams = getValueFromDict(kwargs, "nParams")
    stateNormFactor = getValueFromDict(kwargs, "stateNormFactor", 1.0)
    if action is None:
        nActions = 1
        idx_action = 0
    else:
        nActions = getValueFromDict(kwargs, "nActions")
        idx_action = action
    stateFeatureVectorSize = nParams // nActions
    # Fourier basis over the normalized state: x_i(s) = cos(i * pi * s).
    stateFeatureVector = np.array(
        [np.cos(i * np.pi * state * stateNormFactor) for i in range(stateFeatureVectorSize)],
        dtype=float)
    # One block of state features per action; only the selected action's block is nonzero.
    featureVector = np.zeros(nParams)
    featureVector[idx_action * stateFeatureVectorSize:(idx_action + 1) * stateFeatureVectorSize] = stateFeatureVector
    return featureVector
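# Usage sketch, assuming a scalar state normalized into [0, 1] (stateNormFactor
# rescales the raw state): a basis of orders 0..5 for a single action.
x = fourier(0.25, nParams=6)
# x[i] == cos(i * pi * 0.25); x[0] == 1 acts as a bias feature.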
def radialBasisFunction(state, action=None, **kwargs):
    mu = getValueFromDict(kwargs, "mu")        # centers of the basis functions
    sigma = getValueFromDict(kwargs, "sigma")  # common width
    if action is None:
        nActions = 1
        idx_action = 0
    else:
        nActions = getValueFromDict(kwargs, "nActions")
        idx_action = action
    # Gaussian bumps: x_i(s) = exp(-(s - mu_i)^2 / (2 sigma^2)).
    stateFeatureVector = np.exp(-((state - mu) ** 2) / (2 * sigma ** 2))
    stateFeatureVectorSize = len(stateFeatureVector)
    featureVector = np.zeros(stateFeatureVectorSize * nActions)
    featureVector[idx_action * stateFeatureVectorSize:(idx_action + 1) * stateFeatureVectorSize] = stateFeatureVector
    return featureVector
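# Usage sketch with made-up centers and width (mu and sigma are example values,
# not module defaults): five Gaussians spread over a scalar state range [0, 1].
mu = np.linspace(0.0, 1.0, 5)
x = radialBasisFunction(0.4, mu=mu, sigma=0.25)
# x[i] peaks when the state equals mu[i] and decays smoothly with distance,
# so nearby states share activation, unlike hard state aggregation.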
def __init__(self, nParams, alpha, beta, gamma, targetPolicy, approximationFunctionArgs):
    self.name = "Gradient TD Prediction"
    self.nParams = nParams
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.ftf = getValueFromDict(self.af_kwargs, "ftf")
    self.policy = targetPolicy
    self.w = np.zeros([self.nParams], dtype=float)  # primary weights
    self.v = np.zeros([self.nParams], dtype=float)  # secondary weights for the gradient correction
def __init__(self, alpha_w, alpha_theta, gamma, nParams_w, approximationFunctionArgs,
             nParams_theta, nActions, policyApproximationFunctionArgs):
    self.name = "REINFORCE with Baseline"
    self.alpha_w = alpha_w
    self.alpha_theta = alpha_theta
    self.gamma = gamma
    self.nParams_w = nParams_w
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.w = np.zeros([self.nParams_w], dtype=float)  # baseline (state-value) weights
    self.policy = ParametrizedPolicy(nParams_theta, nActions, policyApproximationFunctionArgs)
    self.bufferExperience = []
def __init__(self, nStates, nActions, nParams, gamma, alpha, thresh_convergence,
             expectedValueFunction, approximationFunctionArgs):
    self.name = "Semi-Gradient Policy Evaluation"
    self.nStates = nStates
    self.nActions = nActions
    self.nParams = nParams
    self.gamma = gamma
    self.alpha = alpha
    self.thresh_convergence = thresh_convergence
    self.computeExpectedValue = expectedValueFunction
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.w = np.zeros([self.nParams], dtype=float)
def __init__(self, alpha_w, alpha_theta, gamma, nParams_w, approximationFunctionArgs,
             nParams_theta, nActions, policyApproximationFunctionArgs):
    self.name = "One-step Actor-Critic"
    self.alpha_w = alpha_w
    self.alpha_theta = alpha_theta
    self.gamma = gamma
    self.nParams_w = nParams_w
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.w = np.zeros([self.nParams_w], dtype=float)  # critic weights
    self.I = 1.0  # discounting accumulator (gamma^t)
    self.policy = ParametrizedPolicy(nParams_theta, nActions, policyApproximationFunctionArgs)
def __init__(self, nStates, nActions, nParams, gamma, alpha, beta, reward, approximationFunctionArgs):
    self.name = "Expected TDC"
    self.nStates = nStates
    self.nActions = nActions
    self.nParams = nParams
    self.gamma = gamma
    self.alpha = alpha
    self.beta = beta
    self.reward = reward
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.ftf = getValueFromDict(self.af_kwargs, "ftf")
    self.w = np.zeros([self.nParams], dtype=float)
    self.v = np.zeros([self.nParams], dtype=float)
def calculateProjectionMatrix(nStates, nParams, ftf_args, mu):
    # Projection onto the span of the feature matrix X, weighted by the state
    # distribution mu: Pi = X (X^T D X)^+ X^T D with D = diag(mu).
    ftf = getValueFromDict(ftf_args, "ftf")
    X = np.zeros((nStates, nParams))
    for s in range(nStates):
        X[s, :] = ftf(s, **ftf_args)
    D = np.diag(mu)
    return X @ np.linalg.pinv(X.T @ D @ X) @ X.T @ D
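# Usage sketch: any projection matrix is idempotent and leaves representable
# value functions unchanged. FixedStateEncoding (defined below) serves as the
# feature function; the encoding matrix and the uniform mu are made-up inputs.
nStates, nParams = 6, 3
enc = np.random.default_rng(0).standard_normal((nStates, nParams))
ftf_args = {"ftf": FixedStateEncoding, "stateEncodingMatrix": enc}
mu = np.full(nStates, 1.0 / nStates)
Pi = calculateProjectionMatrix(nStates, nParams, ftf_args, mu)
assert np.allclose(Pi @ Pi, Pi)    # idempotent
assert np.allclose(Pi @ enc, enc)  # fixes anything already in the span of X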
def selectAction_egreedy(actionValues, **kwargs):
    argmax_function = getValueFromDict(kwargs, "argmaxfun", np.argmax)
    epsilon = kwargs["epsilon"]
    if np.random.rand() < epsilon:
        action = np.random.randint(0, len(actionValues))
    else:
        action = argmax_function(actionValues)
    return action
def __init__(self, nParams, nActions, approximationFunctionArgs, weightInit="random"):
    self.nParams = nParams
    self.nActions = nActions
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.weightInit = weightInit
    if self.weightInit == "zeros":
        self.theta = np.zeros([self.nParams], dtype=float)
    elif self.weightInit == "random":
        self.theta = np.random.randn(self.nParams)
    else:
        sys.exit("ERROR: ParametrizedPolicy: weightInit not recognized!")
def selectAction_UCB(actionValues, **kwargs):
    argmax_function = getValueFromDict(kwargs, "argmaxfun", np.argmax)
    c = kwargs["c"]  # exploration coefficient
    t = kwargs["t"]  # current time step
    N = kwargs["N"]  # per-action selection counts
    if np.min(N) == 0:
        # Try every action at least once before applying the UCB rule.
        return np.argmin(N)
    return argmax_function(actionValues + c * np.sqrt(np.log(t) / N))
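# Usage sketch with made-up value estimates and counts: the upper-confidence
# bonus c * sqrt(log(t) / N) favors actions that have been tried rarely.
Q = np.array([0.2, 0.5, 0.4])
N = np.array([3, 5, 2])
a = selectAction_UCB(Q, c=2.0, t=int(N.sum()), N=N)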
def __init__(self, alpha_w, alpha_theta, alpha_r, lambd_w, lambd_theta, nParams_w,
             approximationFunctionArgs, nParams_theta, nActions,
             policyApproximationFunctionArgs):
    self.name = "Actor-Critic with Eligibility Traces (average reward)"
    self.alpha_w = alpha_w
    self.alpha_theta = alpha_theta
    self.alpha_r = alpha_r
    self.lambd_w = lambd_w
    self.lambd_theta = lambd_theta
    self.nParams_w = nParams_w
    self.nParams_theta = nParams_theta
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.w = np.zeros([self.nParams_w], dtype=float)
    self.z_w = np.zeros([self.nParams_w], dtype=float)          # critic trace
    self.z_theta = np.zeros([self.nParams_theta], dtype=float)  # actor trace
    self.avgR = 0.0  # running estimate of the average reward
    self.policy = ParametrizedPolicy(nParams_theta, nActions, policyApproximationFunctionArgs)
def polynomial(state, action=None, **kwargs):
    nParams = getValueFromDict(kwargs, "nParams")
    stateNormFactor = getValueFromDict(kwargs, "stateNormFactor", 1.0)
    c = getValueFromDict(kwargs, "c")  # exponent matrix, one row per feature
    if action is None:
        nActions = 1
        idx_action = 0
    else:
        nActions = getValueFromDict(kwargs, "nActions")
        idx_action = action
    stateFeatureVectorSize = nParams // nActions
    stateVector = buildStateVector(state * stateNormFactor)
    # Each feature is a product of state components raised to the exponents in c:
    # x_i(s) = prod_j s_j ** c[i, j]
    stateFeatureVector = np.ones(stateFeatureVectorSize)
    for i in range(stateFeatureVectorSize):
        for j in range(len(stateVector)):
            stateFeatureVector[i] *= stateVector[j] ** c[i, j]
    featureVector = np.zeros(nParams)
    featureVector[idx_action * stateFeatureVectorSize:(idx_action + 1) * stateFeatureVectorSize] = stateFeatureVector
    return featureVector
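# Usage sketch, assuming the buildStateVector helper turns a scalar state into
# a one-element vector: exponents c = [[0], [1], [2]] give the polynomial
# basis (1, s, s^2).
c = np.array([[0], [1], [2]])
x = polynomial(0.5, nParams=3, c=c)
# x == [1.0, 0.5, 0.25]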
def stateAggregation(state, action=None, **kwargs):
    '''Simple tile coding with one grid tiling and one state dimension'''
    nParams = getValueFromDict(kwargs, "nParams")
    nStates = getValueFromDict(kwargs, "nStates")
    if action is None:
        nActions = 1
        idx_action = 0
    else:
        nActions = getValueFromDict(kwargs, "nActions")
        idx_action = action
    stateFeatureVectorSize = nParams // nActions
    stateFeatureVector = np.zeros(stateFeatureVectorSize, dtype=int)
    # Map the state onto one of the stateFeatureVectorSize groups.
    mappedIdx = int(mapValues(state, 0, nStates, 0, stateFeatureVectorSize))
    if 0 <= mappedIdx < stateFeatureVectorSize:
        stateFeatureVector[mappedIdx] = 1
    featureVector = np.zeros(nParams)
    featureVector[idx_action * stateFeatureVectorSize:(idx_action + 1) * stateFeatureVectorSize] = stateFeatureVector
    return featureVector
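# Usage sketch, assuming mapValues performs a linear rescaling between the
# given ranges: 100 states aggregated into 10 groups, so state 37 falls into
# group int(37 / 100 * 10) == 3.
x = stateAggregation(37, nParams=10, nStates=100)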
def __init__(self, nParams, nActions, alpha, approximationFunctionArgs,
             actionSelectionMethod="egreedy", epsilon=0.01):
    self.name = "Generic Semi-Gradient TD Control Class"
    self.nParams = nParams
    self.nActions = nActions
    self.alpha = alpha
    self.af_kwargs = approximationFunctionArgs
    self.af = getValueFromDict(self.af_kwargs, "af")
    self.afd = getValueFromDict(self.af_kwargs, "afd")
    self.w = np.zeros([self.nParams], dtype=float)
    self.policy = FunctionApproximationPolicy(
        self.nParams, self.nActions, self.af_kwargs,
        actionSelectionMethod=actionSelectionMethod, epsilon=epsilon)
def tileCoding(state, action=None, **kwargs):
    '''Tile coding with grid tiles'''
    minStates = np.array(getValueFromDict(kwargs, "minStates"))
    maxStates = np.array(getValueFromDict(kwargs, "maxStates"))
    nTilings = getValueFromDict(kwargs, "nTilings")
    tilingOffsets = np.array(getValueFromDict(kwargs, "tilingOffsets"))
    tilingSize = np.array(getValueFromDict(kwargs, "tilingSize"))
    dimStates = len(minStates)
    if action is None:
        nActions = 1
        idx_action = 0
    else:
        nActions = getValueFromDict(kwargs, "nActions")
        idx_action = action
    stateVector = buildStateVector(state)
    stateFeatureVector = []
    for idx_tiling in range(nTilings):
        tileVector = np.zeros(tilingSize[idx_tiling])
        # Shift the state by this tiling's offset and map it onto the grid.
        mappedIdx = mapValues(stateVector + tilingOffsets[idx_tiling],
                              minStates, maxStates,
                              np.zeros(dimStates), tilingSize[idx_tiling])
        mappedIdx = np.array(mappedIdx, dtype=int)
        if np.all(mappedIdx >= 0) and np.all(mappedIdx < tilingSize[idx_tiling]):
            tileVector[tuple(mappedIdx)] = 1
        stateFeatureVector.extend(tileVector.flatten())
    stateFeatureVectorSize = len(stateFeatureVector)
    featureVector = np.zeros(nActions * stateFeatureVectorSize)
    featureVector[idx_action * stateFeatureVectorSize:(idx_action + 1) * stateFeatureVectorSize] = stateFeatureVector
    return np.array(featureVector, dtype=int).flatten()
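# Usage sketch, assuming buildStateVector passes an array state through and
# mapValues rescales linearly and elementwise: two 4x4 tilings over a 2-D
# state in [0, 1]^2, the second shifted by half a tile width. Exactly one tile
# per tiling is active, so the feature vector sums to the number of tilings.
x = tileCoding(np.array([0.3, 0.7]),
               minStates=[0.0, 0.0], maxStates=[1.0, 1.0],
               nTilings=2,
               tilingOffsets=[[0.0, 0.0], [0.125, 0.125]],
               tilingSize=[[4, 4], [4, 4]])
assert x.sum() == 2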
def selectAction_esoft_(actionValues, **kwargs):
    # TODO: consider implementing this
    epsilon = kwargs["epsilon"]
    q_max = np.max(actionValues)
    # All actions tied at the maximum share the greedy probability mass.
    greedy_actions = [i for i, q in enumerate(actionValues) if q == q_max]
    n_greedy_actions = len(greedy_actions)
    non_greedy_action_probability = epsilon / len(actionValues)
    greedy_action_probability = ((1.0 - epsilon) / n_greedy_actions) + non_greedy_action_probability
    p = np.zeros(len(actionValues)) + non_greedy_action_probability
    p[greedy_actions] = greedy_action_probability
    return np.random.choice(len(p), p=p)
def FixedStateEncoding(state, action=None, **kwargs):
    stateEncodingMatrix = getValueFromDict(kwargs, 'stateEncodingMatrix')
    if action is None:
        return stateEncodingMatrix[state, :].T
    return stateEncodingMatrix[state, action, :].T
def computeExpectedValue(self, state, action, w, af_kwargs, gamma):
    # One-step expected target under the model:
    # sum_s' p(s'|s,a) * (r + gamma * v_hat(s', w)).
    af = getValueFromDict(af_kwargs, "af")
    return np.sum([self.stateTransitionProbs[state, action, next_state]
                   * (self.defaultReward + gamma * af(w, next_state, **af_kwargs))
                   for next_state in range(self.nStates)])
def selectAction_esoft(actionValues, **kwargs):
    argmax_function = getValueFromDict(kwargs, "argmaxfun", np.argmax)
    epsilon = kwargs["epsilon"]
    # Spread epsilon evenly over the non-greedy actions; the greedy action
    # keeps probability 1 - epsilon.
    p = np.full(len(actionValues), epsilon / (len(actionValues) - 1))
    p[argmax_function(actionValues)] = 1.0 - epsilon
    return np.random.choice(len(p), p=p)
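# Usage sketch: with epsilon = 0.1 and three actions, the greedy action should
# be chosen about 90% of the time and each other action about 5%.
np.random.seed(0)
choices = [selectAction_esoft(np.array([1.0, 3.0, 2.0]), epsilon=0.1) for _ in range(10000)]
# np.bincount(choices) / 10000 is approximately [0.05, 0.90, 0.05]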