Example #1
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        G = 0
        S = mountaincar.init()
        R, S = mountaincar.sample(S, 1)
        G += R
        while (S):
            Q = Q1[S, :] + Q2[S, :]
            prob1 = np.random.random()
            if prob1 < eps:
                # explore
                A = np.random.choice([0, 1])
            else:
                # greedy
                A = Q.argmax()

            R, S_prime = mountaincar.sample(S, A)
            G += R
            S_prime = int(S_prime)

            prob2 = np.random.choice([1, 2])
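            # double Q-learning: the coin flip decides whether to update Q1 (using Q2 to
            # evaluate Q1's greedy action at S') or to do the symmetric update for Q2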
            if prob2 == 1:
                Q1[S, A] = Q1[S, A] + alpha * (
                    R + GAMMA * Q2[S_prime, (Q1[S_prime]).argmax()] - Q1[S, A])
            else:
                Q2[S, A] = Q2[S, A] + alpha * (
                    R + GAMMA * Q1[S_prime, (Q2[S_prime]).argmax()] - Q2[S, A])

            S = S_prime
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
Example #2
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        S = mountaincar.init()
        R, S = mountaincar.sample(S, 1)
        G += R
        while (S):
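            # evaluation acts greedily with respect to the combined estimate Q1 + Q2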
            Q = Q1[S, :] + Q2[S, :]
            A = Q.argmax()
            R, S = mountaincar.sample(S, A)
            G += R

        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
Example #3
def loop():
    global S, acrl, steps, episodeNum, gotOut

    prev_features = get_features(S, True)
    updatePrototypePlacements()

    if steps % 100 == 0:
        print('num steps: ' + str(steps))

    A = acrl.getAction(prev_features)
    R, Snext = mountaincar.sample(S, A)
    acrl.R = R
    if steps >= 5000 or Snext == None or isnan(Snext[0]) or isnan(Snext[1]):
        print('num steps: ' + str(steps))
        print('Snext: ' + str(Snext))
        print('Breaking from episode: ' + str(episodeNum))
        episodeNum += 1
        if Snext == None or isnan(Snext[0]) or isnan(Snext[1]):
            gotOut = True
        return True
    acrl.Value(prev_features)
    acrl.Delta()
    next_features = get_features(Snext, False)
    acrl.Next_Value(next_features)
    acrl.Delta_Update()
    acrl.Average_Reward_Update()
    acrl.Trace_Update_Critic(prev_features)
    acrl.Weights_Update_Critic()
    acrl.Compatible_Features(A, prev_features)
    acrl.Trace_Update_Actor()
    acrl.Weights_Update_Actor()
    S = Snext
    steps += 1
    return False
Example #4
def act(self, action, discount):
    """ do an action and update Q given the discount factor and step size """
    if self._time < self._T:
        (r, sp) = mountaincar.sample(self._s, action)
        self._r_sum += r
        self._tr[self._time % self._n] = (self._s, action)
        if sp == None: # if terminal
            self._T = self._time + 1
            self._delta[self._time%self._n] = r - self._Qt[self._time%(self._n+1)] # TD error
        else: # commit the next action
            action = self.pick_action(sp) # select arbitrarily and store an action as A_(t+1)
            self._Qt[(self._time + 1)%(self._n+1)] = self._Q[sp, action] # Store Q(St+1;At+1) as Qt+1
            self._sigma[(self._time+1)%self._n] = self._sig
            self._delta[self._time%self._n] = r - self._Qt[self._time%(self._n+1)] + \
              discount*((1-self._sigma[(self._time+1)%self._n]) * self.expected_Q(sp) + self._sigma[(self._time+1)%self._n] * self._Q[sp, action])
            self._pi[(self._time+1)%self._n] = self.get_action_probability(sp, action)
        self._s = sp # update agent state
   self._tau = self._time + 1 - self._n # time whose estimate is being updated
   if self._tau >= 0:
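        # build the n-step Q(sigma)-style return: start from the stored estimate Q_tau and add decayed TD errors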
       E = 1.0
       G = self._Qt[self._tau%(self._n+1)]
       for k in range(self._tau, int(min(self._time, self._T-1))+1):
           G += E * self._delta[k%self._n]
           E *= discount * ((1-self._sigma[(k+1)%self._n]) * self._pi[(k+1)%self._n] + self._sigma[(k+1)%self._n])
       s, a = self._tr[self._tau%self._n]
       self._Q[s, a] = G
   self._time += 1
   return action # return the committed next action
Example #5
def loop():
	global S, acrl, steps, episodeNum, gotOut

	prev_features = get_features(S, True)
	updatePrototypePlacements()

	if steps % 100 == 0:
		print('num steps: ' + str(steps))


	A = acrl.getAction(prev_features)
	R,Snext = mountaincar.sample(S,A)
	acrl.R = R
	if steps >= 5000 or Snext == None or isnan(Snext[0]) or isnan(Snext[1]):
		print('num steps: ' + str(steps))
		print('Snext: ' + str(Snext))
		print('Breaking from episode: ' + str(episodeNum))
		episodeNum += 1
		if Snext == None or isnan(Snext[0]) or isnan(Snext[1]):
			gotOut = True
		return True
	acrl.Value(prev_features)
	acrl.Delta()
	next_features = get_features(Snext, False)
	acrl.Next_Value(next_features)
	acrl.Delta_Update()
	acrl.Average_Reward_Update()
	acrl.Trace_Update_Critic(prev_features)
	acrl.Weights_Update_Critic()
	acrl.Compatible_Features(A,prev_features)
	acrl.Trace_Update_Actor()
	acrl.Weights_Update_Actor()
	S = Snext
	steps += 1
	return False
Example #6
def learnEpisode(alpha, eps, gamma, theta1, theta2):
        in1, in2 = mountaincar.init()
        currentStates = tilecode(in1, in2, [-1]*numTilings) # tile indices (features) of the initial state
        episodeReturn = 0
        step = 0
        while(True): # continue until we reach terminal state (None)
            action = epsGreedyPolicy(currentStates, eps, theta1, theta2)
            reward, nextStatePosVel = mountaincar.sample((in1, in2), action)
            episodeReturn += reward
            step += 1
            if nextStatePosVel:
                nextIn1, nextIn2 = nextStatePosVel
                nextStates = tilecode(nextIn1, nextIn2, [-1]*numTilings)
                if(np.random.randint(0,2)):  # randint(0, 2) gives 0 or 1, i.e. a fair coin flip
                    updateTheta(theta1, theta2, currentStates, nextStates, action, reward, alpha, gamma)
                else:
                    updateTheta(theta2, theta1, currentStates, nextStates, action, reward, alpha, gamma)
                currentStates = nextStates
                in1, in2 = nextIn1, nextIn2
            else: # next state is terminal state
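                # note: nextStates below still holds the tiles from the previous step
                # (and is unbound if the very first transition is terminal)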
                if(np.random.randint(0,2)):  # randint(0, 2) gives 0 or 1, i.e. a fair coin flip
                    updateTheta(theta1, theta2, currentStates, nextStates, action, reward, alpha, gamma)
                else:
                    updateTheta(theta2, theta1, currentStates, nextStates, action, reward, alpha, gamma)
                return episodeReturn, step
Example #7
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        tileIndices = [-1] * numTilings
        pos, vel = mountaincar.init()
        state = (pos, vel)
        step = 0
        while state != None:
            tilecode(pos, vel, tileIndices)
            action = chooseaction(state, theta1, theta2)
            r, nstate = mountaincar.sample(state, action)
            tileIndices = [-1] * numTilings
            if nstate != None:
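                # double Q-learning style: a fair coin decides whether theta1 or theta2 is updated this step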
                if randint(0, 2) == 0:
                    naction = chooseaction(nstate, theta1, theta2)
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta1[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r + Total(nstate, naction, theta2) -
                                   Total(state, action, theta1))
                else:
                    naction = chooseaction(nstate, theta1, theta2)
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta2[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r + Total(nstate, naction, theta1) -
                                   Total(state, action, theta2))
            else:
                if randint(0, 2) == 0:
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta1[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r - Total(state, action, theta1))
                else:
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta2[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r - Total(state, action, theta2))
            state = nstate
            G += r
            step += 1
        #print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        avgrlist[episodeNum] += G
        avgslist[episodeNum] += step
        returnSum += G
    #print("Average return:", returnSum / numEpisodes)

    return returnSum, theta1, theta2, step
Example #8
def learn():    
    runSum = 0.0
    for run in xrange(numRuns):
        theta = -0.01*rand(n)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            step = 0
            G = 0        
            traces = zeros(n)
            S=mountaincar.init()
            # Until S is terminal:
            while S!=None:
                # Choose action
                tilecode(S,F)
                if rand() <= Emu:                 # randomly explore
                    a = randint(0, 2)
                else:                             # greedy action choice
                    a = argmax([QValue(F,0,theta),QValue(F,1,theta),QValue(F,2,theta)])
                # Replacing traces on indices where feature vector is 1
                for index in F:
                    traces[index+(a*numTiles)] = 1                     
                # Take action, observe r,Sp
                r,Sp=mountaincar.sample(S,a)
                G += r
                # If terminal action update theta and end episode
                if Sp == None:
                    delta = r - QValue(F,a,theta)
                    theta =  theta + alpha*delta*traces
                    break
                # Choose expected next action
                tilecode(Sp,Fp)
                ap = argmax([QValue(Fp,0,theta),QValue(Fp,1,theta),QValue(Fp,2,theta)])
                # Update theta
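                # Expected Sarsa target: weight Epi/3 on each action plus (1 - Epi) on the greedy action ap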
                randomAction = (Epi/3)*QValue(Fp,0,theta) + (Epi/3)*QValue(Fp,1,theta)+ (Epi/3)*QValue(Fp,2,theta)
                delta = r + randomAction + (1-Epi)*QValue(Fp,ap,theta) - QValue(F,a,theta)
                theta = theta + alpha*delta*traces
                # Decay every component
                traces = gamma*lmbda*traces
                S=Sp
                step += 1
            returnSum += G        
    
            print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
            episodeReturn[episodeNum] += (G-episodeReturn[episodeNum])/(numRuns+1)
            episodeSteps[episodeNum] += (step-episodeSteps[episodeNum])/(numRuns+1)
        print "Average return:", returnSum/numEpisodes
        runSum += returnSum
    print "Overall performance: Average sum of return per run:", runSum/numRuns
    writeAverages(episodeReturn,episodeSteps)
Example #9
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        #your code goes here (20-30 lines, depending on modularity)
        state = mountaincar.init()
        step = 0

        while state != None:
            tileIndices = [-1]*numTilings
            tilecode(state[0], state[1], tileIndices) # state[0]: position, state[1]: velocity
            tileIndices = np.array(tileIndices)
            q0 = Qs(theta1, tileIndices) + Qs(theta2, tileIndices) # if action is 0
            q1 = Qs(theta1, tileIndices+numTiles) + Qs(theta2, tileIndices+numTiles) # if action is 1
            q2 = Qs(theta1, tileIndices+numTiles*2) + Qs(theta2, tileIndices+numTiles*2) # if action is 2
            Q = np.array([q0, q1, q2])

            # apply epsilon greedy to choose actions
            greedy = np.random.random()
            if(greedy >= epsilon):
                action = Q.argmax()
            else:
                action = np.random.randint(0,3)

            reward, nextS = mountaincar.sample(state, action)
            G = G + reward
            step += 1

            # active feature indices for the chosen action
            actionIndices = tileIndices + action * numTiles

            if nextS == None: # if next state is terminal, the target is just the reward
                if np.random.randint(0, 2):
                    delta = reward - Qs(theta1, actionIndices)
                    for i in actionIndices:
                        theta1[i] += alpha * delta
                else:
                    delta = reward - Qs(theta2, actionIndices)
                    for i in actionIndices:
                        theta2[i] += alpha * delta
            else:
                # double Q-learning: update one estimate, using the other to evaluate its greedy action
                nextIndices = [-1]*numTilings
                tilecode(nextS[0], nextS[1], nextIndices)
                nextIndices = np.array(nextIndices)
                if np.random.randint(0, 2):
                    nextQ1 = np.array([Qs(theta1, nextIndices + a*numTiles) for a in range(3)])
                    delta = reward + Qs(theta2, nextIndices + nextQ1.argmax()*numTiles) - Qs(theta1, actionIndices)
                    for i in actionIndices:
                        theta1[i] += alpha * delta
                else:
                    nextQ2 = np.array([Qs(theta2, nextIndices + a*numTiles) for a in range(3)])
                    delta = reward + Qs(theta1, nextIndices + nextQ2.argmax()*numTiles) - Qs(theta2, actionIndices)
                    for i in actionIndices:
                        theta2[i] += alpha * delta

            state = nextS

        print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        returnSum += G
    print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
Example #10
def learn(alpha=.1/numTilings, epsilon=0, numEpisodes=1000, numRuns=1):

    returnSum = 0.0
    avgEpisodeReturns = [0]*numEpisodes
    doubleQ = DoubleQ(alpha, epsilon)

    for run in range(numRuns):
        doubleQ.resetQ()
        for episodeNum in range(numEpisodes):
            print("Run: " + str(run) + ", Episode: " + str(episodeNum) + " ....")
            G = 0
            step = 0
            isTerminal = False
            #initialize the mountain car
            stateTuple = mountaincar.init()
            state = tilecode(stateTuple[0], stateTuple[1])

            while (not isTerminal):
                action = doubleQ.policy(state)
                reward, stateTuple = mountaincar.sample(stateTuple, action)
                G += reward
                step += 1
                if stateTuple:
                    nextState = tilecode(stateTuple[0], stateTuple[1])
                else:
                    nextState = None
                    
                doubleQ.learn(state, action, nextState, reward)           

                if not stateTuple:
                    isTerminal = True
                else:
                    state = nextState

            print("Run: ",  run+1, " Episode: ", episodeNum, " Steps:", step, " Return: ", G)
            returnSum = returnSum + G
            avgEpisodeReturns[episodeNum] = avgEpisodeReturns[episodeNum] +  (1/(run+1))*(G - avgEpisodeReturns[episodeNum])

    return avgEpisodeReturns, doubleQ.theta1, doubleQ.theta2
Example #11
def test_params(_lmbda, _alpha, _epsilon):
	global theta, e
	Epi = Emu = _epsilon
	alpha = _alpha
	lmbda = _lmbda
	runSum = 0.0
	for run in xrange(numRuns):
		e = np.zeros(numTilings*n*3)
		theta = -0.01*np.random.random_sample(numTilings*n*3)
		returnSum = 0.0
		for episodeNum in xrange(numEpisodes):
		    G = 0
		    S = mountaincar.init()
		    step = 0
		    while(S!=None):
		        step+=1
		        A = epsilon_greedy_policy(S)
		        R, S_next = mountaincar.sample(S,A)
		        G+=R
		        #since value of terminal state is 0 by definition
		        #computation for delta is simplified
		        if(S_next==None):
		            delta = R - q(S,A)
		        else:
		            delta = R+Epi*np.average([q(S_next,a) for a in [0,1,2]]) +\
		                (1-Epi)*np.max([q(S_next,a) for a in [0,1,2]]) - q(S,A)
		        e*=gamma*lmbda
		        tilecode(S[0], S[1], F)
		        for index in [i+A*numTilings*n for i in F]:
		            e[index] = 1
		        theta +=alpha*delta*e
		        S=S_next
		        if(step >10000): return -10000000000
		    returnSum = returnSum + G
		runSum += returnSum
	return runSum/numRuns
Example #12
     
      #Summing up values for coasting 
      Q[1] += theta[features + numTiles]
   
      #Summing up values for acceleration
      Q[2] += theta[features + (2*numTiles)]
    
  #Selecting action to take
  if rand() <= Emu:
      action = randint(3)
  else:
      action = argmax(Q)
     
  
  #Taking the action. Store results  
  result = mountaincar.sample(state,action)
  G += result[0]
  nextState = result[1]
 
  #Calculating delta
  delta = result[0] - Q[action]
  
  
  #Updating traces
  for features in F:
      e[features + (action *numTiles)] = 1;
  
  #Breaking out if next state is none
  if nextState == None:
      theta = theta + (alpha * delta * e)
      break
Example #13
        # repeat for each step of episode
        while S is not None:

            # initialize A
            A = 0

            # get a list of four tile indices
            tilecode(S[0], S[1], F)

            Q = Qs(F)

            # pick the action
            A = egreedy(Q, Emu)

            # observe reward, and next state
            R, Sprime = mountaincar.sample(S, A)

            delta = R - Q[A]

            G = G + R

            for i in F:
                # replacing traces
                e[i + (A * numTiles)] = 1

            # if S' is terminal, then update theta; go to next episode
            if Sprime == None:
                theta = theta + alpha * delta * e
                break

            tilecode(Sprime[0], Sprime[1], F)
Example #14
        e = np.zeros(n)                                #Initialize eligibility vector
        steps = 0

        while (True):
            Q = [0, 0, 0]                              # action values Q(S, a), summed from the active features
            A = 0
            tilecode (S[0], S[1], F)                   # Get the tile indices F for (position, velocity)
            for j in range(3):
                for i in F:
                    Q[j] = Q[j] + w[i + (j*9*9*4)]     # 4 tilings of a 9x9 grid per action

            if (random.uniform(0,1) < epsilon):        # Epsilon greedy
                A = random.choice(actions)
            else:
                A = Q.index(max(Q))
            R,Sp  = mountaincar.sample (S,A)           # Take the action, observe reward and next state
            delta = R - Q[A]
            G += R

            for i in F: e[i+(A*4*9*9)] = 1

            if (Sp == None): w += alpha*delta*e; break # If terminal state, end the episode

            Qp = [0,0,0]
            tilecode (Sp[0], Sp[1], F)
            for j in range(3):
                for i in F:
                    Qp[j] = Qp[j] + w[i + (j*9*9*4)]   # Update the next (S,A)

            steps += 1
            delta += Qp[argmax(Qp)]
Example #15
    w = zeros(n)
    for episodeNum in range(numEpisodes):
        G = 0
        e = zeros(n)
        carState = mountaincar.init()
        while not carState==None:
            Qa = zeros(3)
            Fa = zeros(4)
            for a_poss in [0,1,2]:
                tilecode(carState,Fa)
                assert (sum(Fa) > 0) # make sure Fa is populated
                Qa[a_poss] = getStateActionValue(w,Fa,a_poss)

            # get an action, act on it, and observe the reward
            A = getEpsilonGreedyAction(Qa)
            R,carStateNew = mountaincar.sample(carState,A)
            G = G + R

            delta = R - Qa[A]

            for i in Fa: # for each active feature index
                e[i+numTiles*A] = 1

            # if the new state is the terminal state, update the weight vector and break
            if carStateNew==None:
                w = w + alpha*delta * e
                break

            # update values for the weight vector and the eligibility traces
            Qa = zeros(3)
            Fa = zeros(4)
Example #16
        # repeat for each step of episode
        while S is not None:

            # initialize A 
            A = 0
            
            # get a list of four tile indices
            tilecode(S[0], S[1], F)
            
            Q = Qs(F)
            
            # pick the action
            A = egreedy(Q, Emu)
            
            # observe reward, and next state
            R, Sprime = mountaincar.sample(S, A)
            
            delta = R - Q[A]
            
            G = G + R
            
            for i in F:
                # replacing traces
                e[i + (A*numTiles)] = 1
            
            # if S' is terminal, then update theta; go to next episode
            if Sprime == None:
                theta = theta + alpha * delta * e
                break
            
            tilecode(Sprime[0], Sprime[1], F)
Example #17
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        S = mountaincar.init()  # S[0] is the position, S[1] is the velocity
        #start = True
        step = 0
        while True:
            #print('$'*80)
            #print('new S: ', S)
            q1 = [0] * 3  # for each possible actions, each has a q value
            q2 = [0] * 3
            phi = [0] * n  # initialize the list of features ø
            tileIndices = [-1] * numTilings
            tilecode(S[0], S[1], tileIndices)
            #print('tileIndices: ', tileIndices)

            # choose action, from a epsilon greedy
            num = np.random.random()
            if (num >= epsilon):
                for possibleAction in range(0, 3):
                    # generate q value for each possible actions
                    for index in tileIndices:  # implementing the vector multiplication thetaT*phi
                        q1[possibleAction] = q1[possibleAction] + theta1[
                            possibleAction * numTiles + index] * 1
                        q2[possibleAction] = q2[possibleAction] + theta2[
                            possibleAction * numTiles + index] * 1
                action = argmax([a + b for a, b in zip(q1, q2)
                                 ])  # choose the greedy action
                #print('action is: ', action)
            else:
                action = np.random.randint(0,
                                           3)  # choose the stochastic action

            #print('action is: ', action)
            # actually generate the features, based on the action
            indices = [action * numTiles + index for index in tileIndices
                       ]  # indicates which position in phi is 1

            # sample the next S, reward
            reward, nextS = mountaincar.sample(S, action)
            #print('nextS:', nextS)
            #print('reward: ',reward)
            G = G + reward
            step += 1
            #print('G:', G)

            if nextS == None:
                # terminal S
                if np.random.randint(0, 2):
                    for i in indices:
                        theta1[i] = theta1[i] + alpha * (reward - q1[action])
                        #G = G+reward
                        #step+=1
                else:
                    for i in indices:
                        theta2[i] = theta2[i] + alpha * (reward - q2[action])
                        #G = G+reward
                        #step+=1
                break
            else:
                # not terminal S
                # need to compute phi for the next S
                nextQ1 = [0] * 3
                nextQ2 = [0] * 3
                #nextPhi = [0]*n
                nextTileIndices = [-1] * numTilings
                tilecode(nextS[0], nextS[1], nextTileIndices)
                #print('nextTileIndices: ', nextTileIndices)

                nextQ1 = Qs(nextTileIndices, theta1)
                nextQ2 = Qs(nextTileIndices, theta2)

                if np.random.randint(0, 2):  # with 0.5 probability
                    nextAction = argmax(nextQ1)
                    for i in indices:
                        theta1[i] = theta1[i] + alpha * (
                            reward + nextQ2[nextAction] - q1[action])

                else:  # with 0.5 probability
                    nextAction = argmax(nextQ2)
                    for i in indices:
                        theta2[i] = theta2[i] + alpha * (
                            reward + nextQ1[nextAction] - q2[action])
                    #print(theta2)

            S = nextS
        steps[episodeNum] = steps[episodeNum] + step
        returns[episodeNum] = returns[episodeNum] + G
        #print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        returnSum += G
    #print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
Example #18
    for episodeNum in xrange(numEpisodes):
        G = 0
        #your code goes here (20-30 lines, depending on modularity)
        step = 0.0
        s = mountaincar.init()
        trace = zeros(n)
        Q = zeros(3)
        while s is not None:
            step += 1
            tilecode(s[0], s[1], F)
            Q = Policy(F, 3, theta)
            if rand() <= epsilon:
                action = randint(0, 2)
            else:
                action = argmax(Q)
            r, sp = mountaincar.sample(s, action)
            delta = r - Q[action]
            G += r
            for i in F:
                trace[i + action * numTiles] = 1
            if sp == None:
                theta += alpha * delta * trace
                break
            tilecode(sp[0], sp[1], F)
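            # complete the target with the greedy value of the next state (Q-learning style)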
            delta += max(Policy(F, 3, theta))
            theta += alpha * delta * trace
            trace = lmbda * trace * gamma
            s = sp

        print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
        returnSum = returnSum + G
Example #19
    runs = np.zeros(numRuns)

    timeSteps = np.zeros((numRuns,numEpisodes))
    returns = np.zeros((numRuns,numEpisodes))
    for run in range(numRuns):
    mc = ACRL()
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        S = mountaincar.init()
        G = 0
        steps = 0
        mc.Erase_Traces()
        while(1):
            prev_features = get_features(S)
            A = mc.getAction(prev_features)
            R,Snext = mountaincar.sample(S,A)
            mc.R = R
            if steps >= 5000 or Snext == None: # or isnan(Snext[0]) or isnan(Snext[1]):
                break
            mc.Value(prev_features)
            mc.Delta()
            next_features = get_features(Snext)
            mc.Next_Value(next_features)
            mc.Delta_Update()
            mc.Average_Reward_Update()
            mc.Trace_Update_Critic(prev_features)
            mc.Weights_Update_Critic()
            mc.Compatible_Features(A,prev_features)
            mc.Trace_Update_Actor()
            mc.Weights_Update_Actor()
            S = Snext
Example #20
File: mcDP.py Project: DZ9/qsigma
    return pos * len(vel_range) + vel


# add states and actions to mdp
mcar = MDP()
mcar.add_states(n_states + 1)  # add terminal state at end
for i in range(n_states):
    mcar.add_actions(i, n_actions)

# wire up mdp
print('building mdp...')
for p in pos_range:
    for v in vel_range:
        s = (p, v)
        for a in range(3):
            R, sp = mountaincar.sample(s, a)
            mcar.add_transition(state_id(s), a, (state_id(sp), R, 1.0))

# compute values
print('solving mdp...')
V = mcar.value_iteration(1.0)

# map values for plotting
print('mapping function...')
x = pos_range
y = vel_range
plot_V = np.zeros([len(y), len(x)])
for i in range(len(x)):
    for j in range(len(y)):
        plot_V[j, i] = -V[state_id((x[i], y[j]))]
Example #21
        # initialize observation
        observation = mountaincar.init()

        # use function approximation to generate next state
        tilecode(observation[0], observation[1], state)

        # compute the Q values for the state and every action
        Q = Qs(state)

        terminal = False
        A = chooseAction(Q)
        unknownObs = observation
        
        if flipped:
            R, observation, terminal = mountaincar.sample(observation, A, terminal)
            someRandomAmountOfTime = random.randint(minNumExtraSteps,maxNumExtraSteps)
            for i in range(1, someRandomAmountOfTime):
                unknownR, unknownObs, terminal = mountaincar.sample(unknownObs, A, terminal)
                G += unknownR
            step += someRandomAmountOfTime

        # repeat for each step of episode
        while True:

            if not flipped:
                # take action a and get reward R and new observation
                R, observation, terminal = mountaincar.sample(unknownObs, A, terminal)
                # if newObservation is terminal
                if terminal:
                    w += alpha*delta*e
Example #22
def learn():
    runSum = 0.0
    for run in xrange(numRuns):
        theta = -0.01 * rand(n)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            step = 0
            G = 0
            traces = zeros(n)
            S = mountaincar.init()
            # Until S is terminal:
            while S != None:
                # Choose action
                tilecode(S, F)
                if rand() <= Emu:  # randomly explore
                    a = randint(0, 2)
                else:  # greedy action choice
                    a = argmax([
                        QValue(F, 0, theta),
                        QValue(F, 1, theta),
                        QValue(F, 2, theta)
                    ])
                # Replacing traces on indices where feature vector is 1
                for index in F:
                    traces[index + (a * numTiles)] = 1
                # Take action, observe r,Sp
                r, Sp = mountaincar.sample(S, a)
                G += r
                # If terminal action update theta and end episode
                if Sp == None:
                    delta = r - QValue(F, a, theta)
                    theta = theta + alpha * delta * traces
                    break
                # Choose expected next action
                tilecode(Sp, Fp)
                ap = argmax([
                    QValue(Fp, 0, theta),
                    QValue(Fp, 1, theta),
                    QValue(Fp, 2, theta)
                ])
                # Update theta
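                # Expected Sarsa target: weight Epi/3 on each action plus (1 - Epi) on the greedy action ap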
                randomAction = (Epi / 3) * QValue(
                    Fp, 0, theta) + (Epi / 3) * QValue(
                        Fp, 1, theta) + (Epi / 3) * QValue(Fp, 2, theta)
                delta = r + randomAction + (1 - Epi) * QValue(
                    Fp, ap, theta) - QValue(F, a, theta)
                theta = theta + alpha * delta * traces
                # Decay every component
                traces = gamma * lmbda * traces
                S = Sp
                step += 1
            returnSum += G

            print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
            episodeReturn[episodeNum] += (G - episodeReturn[episodeNum]) / (
                numRuns + 1)
            episodeSteps[episodeNum] += (step - episodeSteps[episodeNum]) / (
                numRuns + 1)
        print "Average return:", returnSum / numEpisodes
        runSum += returnSum
    print "Overall performance: Average sum of return per run:", runSum / numRuns
    writeAverages(episodeReturn, episodeSteps)
Example #23
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        S = mountaincar.init()
        step = 0
        while (S):
            indexList = [-1] * numTilings
            tilecode(S[0], S[1], indexList)
            indexList = np.array(indexList)
            q0 = qVal(theta1, indexList) + qVal(theta2, indexList)
            q1 = qVal(theta1, indexList + numTiles) + qVal(
                theta2, indexList + numTiles)
            q2 = qVal(theta1, indexList + 2 * numTiles) + qVal(
                theta2, indexList + 2 * numTiles)
            Q = np.array([q0, q1, q2])

            prob1 = np.random.random()
            if prob1 < epsilon:
                # explore
                A = np.random.choice([0, 1, 2])
            else:
                # greedy
                A = Q.argmax()

            R, S_prime = mountaincar.sample(S, A)
            G += R

            prob2 = np.random.choice([1, 2])
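            # double Q-learning: theta_n is the estimate being updated, theta_prime evaluates its greedy next action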
            if prob2 == 1:
                theta_n = theta1
                theta_prime = theta2
            else:
                theta_n = theta2
                theta_prime = theta1
            indexList = [x + A * numTiles for x in indexList]
            qval_theta_n = qVal(theta_n, indexList)

            if not S_prime:
                for index in indexList:
                    theta_n[index] = theta_n[index] + alpha * (R -
                                                               qval_theta_n)
                break

            indexList_prime = [-1] * 4
            tilecode(S_prime[0], S_prime[1], indexList_prime)
            indexList_prime = np.array(indexList_prime)

            q0_n = qVal(theta_n, indexList_prime)
            q1_n = qVal(theta_n, indexList_prime + numTiles)
            q2_n = qVal(theta_n, indexList_prime + 2 * numTiles)
            A_prime = np.array([q0_n, q1_n, q2_n]).argmax()
            q_prime_max = qVal(theta_prime,
                               A_prime * numTiles + indexList_prime)

            for index in indexList:
                theta_n[index] = theta_n[index] + alpha * (R + q_prime_max -
                                                           qval_theta_n)

            S = S_prime
            step += 1

        # print("Episode: ", episodeNum, "Steps:", step, "Return: ", G)
        returnSum = returnSum + G
    # print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
Example #24
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    runEpisodeReturns = []

    for episodeNum in range(numEpisodes):
        G = 0
        step = 0
        currentState = mountaincar.init()
        terminate = False
        while not terminate:
            action = argmax([
                qHat(currentState, 0, theta1) + qHat(currentState, 0, theta2),
                qHat(currentState, 1, theta1) + qHat(currentState, 1, theta2),
                qHat(currentState, 2, theta1) + qHat(currentState, 2, theta2)
            ])
            R, nextState = mountaincar.sample(currentState, action)
            if (nextState is None):

                if randint(0, 2) == 0:  # 0.5 probability
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta1[phi[i] + (action * numTiles)] += alpha * (
                            R - qHat(currentState, action, theta1))

                else:  # 0.5 probability
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta2[phi[i] + (action * numTiles)] += alpha * (
                            R - qHat(currentState, action, theta2))
                terminate = True

            else:

                if randint(0, 2) == 0:  #0.5 probability
                    nextAction = argmax([
                        qHat(nextState, 0, theta1),
                        qHat(nextState, 1, theta1),
                        qHat(nextState, 2, theta1)
                    ])
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta1[phi[i] + (action * numTiles)] += alpha * (
                            R + qHat(nextState, nextAction, theta2) -
                            qHat(currentState, action, theta1))
                else:  #0.5 probability
                    nextAction = argmax([
                        qHat(nextState, 0, theta2),
                        qHat(nextState, 1, theta2),
                        qHat(nextState, 2, theta2)
                    ])
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta2[phi[i] + (action * numTiles)] += alpha * (
                            R + qHat(nextState, nextAction, theta1) -
                            qHat(currentState, action, theta2))
                currentState = nextState

            #print("Episode: ", episodeNum, "Return: ", G)
            G = G + R
            step += 1
        runEpisodeReturns.append(G)
        # print("Episode: ", episodeNum, "Steps:", step, "Return: ", G)
        returnSum = returnSum + G

    #print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2, runEpisodeReturns
Example #25
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        step = 0
        e = np.zeros([numTiles,3])
        (position, velocity) = mountaincar.init()
        while 1: 
            tilecode(position, velocity, F)
            Q = np.sum(theta[F],axis=0) 

            if np.random.random() > epsilon:
                A = np.argmax(Q)
            else:
                A = np.random.randint(numActions)
     
            R, result = mountaincar.sample((position, velocity), A)
            error = R - Q[A]
            eOld = copy.copy(e)
            e[F,A] = 1
            G += R
            if result == None:
                theta = theta + alpha * error * e
                break

            newPosition,newVelocity = result
            oldF = copy.copy(F)
            tilecode(newPosition, newVelocity, F)
            
            Q = np.sum(theta[F],axis=0)

            error = error + (1 - epsilon) * np.max(Q) + epsilon \
Example #26
runSum = []
for run in range(numRuns):
    w = -0.01*rand(n)
    returnSum = 0.0
    
    for episodeNum in range(numEpisodes):
        zerovec = zeros(n)
        G = 0
        A = 0
        S = mountaincar.init()
        F = actionTileCode(F,S,A)
        zerovec[F] = 1
        episodeLen = 0
        while(S is not None):
            episodeLen = episodeLen + 1
            RSA = mountaincar.sample(S,A)
            R = RSA[0]
            S = RSA[1]
            G = G + R
            delta = R - sum(w[F])
            q = zeros(3)
            
            if(S is not None):
                for a in range(3):
                    F = actionTileCode(F,S,a)
                    q[a] = sum(w[F])
            else:
                w = w + alpha*delta*zerovec
                break
                    
            expected_q = getExpected(q)
Example #27
 theta = -0.01 * numpy.random.rand(n)
 returnSum = 0.0
 for episodeNum in xrange(numEpisodes):
     G = 0
     #your code goes here (20-30 lines, depending on modularity)
     steps = 0
     e = numpy.zeros(n)
     s = mc.init()
     Q = numpy.zeros(numActions)
     while s != None:
         #print Q
         steps += 1
         tilecode(s[0], s[1], F)
         Q = Qs(F)
         a = numpy.argmax(Q)
         r, s1 = mc.sample(s, a)
         G += r
         delta = r - Q[a]
         for i in F:
             e[i + a * 324] = 1
         if s1 == None:
             for i in range(n):
                 theta[i] += alpha * delta * e[i]
             break
         tilecode(s1[0], s1[1], F)
         Q = Qs(F)
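         # Q-learning bootstrap: add the greedy value of the next state (no discounting appears, i.e. gamma is treated as 1)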
         delta = delta + numpy.max(Q)
         for i in range(n):
             theta[i] += alpha * delta * e[i]
             e[i] = lmbda * e[i]
         s = s1
Example #28
	for episodeNum in xrange(numEpisodes):
		G = 0	
#	your code goes here (20-30 lines, depending on modularity)
		step=0
		e=np.zeros(n)
		s=mc.init()
		Q=np.zeros(numActions)
		while s!=None:
			step=step+1
			tilecode(s[0],s[1],F)
			Q=np.zeros(numActions)
			for a in range(3):
				for _ in F:
					Q[a]=Q[a]+theta[_+a*324]
			a=np.argmax(Q)
			r, s1=mc.sample(s,a)
			G+=r
			delta=r-Q[a]
			for i in F:
				e[i+a*324]=1
			if s1==None:
				for i in range(n):
					theta[i]=theta[i]+alpha*delta*e[i]
				break
			tilecode(s1[0],s1[1],F)
			Q=np.zeros(numActions)
			for a in range(3):
				for i in F:
					Q[a]=Q[a]+theta[i+a*324]
			delta=delta+np.max(Q)
			for _ in range(n):
Example #29
returnsArray = numpy.zeros((numRuns,numEpisodes))
runSum = 0.0
for run in range(numRuns):
    w = -0.01*pylab.rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        state = mountaincar.init()
        e = numpy.zeros(n)
        steps = 0
        while state != None:
            Tilecoder.tilecode(state[0], state[1], listOfTiles)
            Q = getQValues(w)
            action = eGreedy(Q)
      
            reward, statePrime = mountaincar.sample(state, action)
            G = G + reward
            delta = reward - Q[action]
      
            for index in listOfTiles:
                e[(numTiles*action)+index] = 1   
    
            if statePrime == None:           
                for i in range(len(w)):
                    w[i] = w[i] + alpha * delta * e[i]
                    
                state = statePrime
                
            else:
                Tilecoder.tilecode(statePrime[0], statePrime[1], listOfTiles)
                Q = getQValues(w)              
Example #30
 et = numpy.zeros(n)
 step = 0
 
 
 while St != None:  
     
     step+=1
     tilecode(St[0],St[1],F)
     Q=newQ(F)
     
     # policy here, if Epi is changed, action may select differently
     action = numpy.argmax(Q)
     if Epi > random_sample():
         action = randint(0,3)
         
     r, St1 = mountaincar.sample(St,action)
     G+=r
     delta=r-Q[action]
     for i in F:
             et[i+action*e_para]=1
     if St1 == None:
         for i in range(n):
             theta[i]+=alpha*delta*et[i]
         break
     tilecode(St1[0],St1[1],F)
     Q=newQ(F)
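     # complete the target with the greedy next-state value (Q-learning style update)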
     delta=delta+numpy.max(Q)
     
     for i in range(n):
         theta[i]+=alpha*delta*et[i]
         et[i]=lmbda*et[i]
Example #31
 for episodeNum in xrange(numEpisodes):
     G = 0
     #your code goes here (20-30 lines, depending on modularity)
     step=0.0
     s = mountaincar.init()
     trace=zeros(n)
     Q=zeros(3)
     while s is not None:
         step += 1
         tilecode(s[0],s[1],F)
         Q = Policy(F,3,theta)
         if rand() <= epsilon:
             action = randint(0,2)
         else:
             action = argmax(Q)
         r, sp = mountaincar.sample(s,action)
         delta=r-Q[action]
         G+=r
         for i in F: trace[i+action*numTiles] = 1
         if sp == None:
             theta += alpha*delta*trace
             break
         tilecode(sp[0],sp[1],F)
         delta += max(Policy(F,3,theta))
         theta += alpha * delta * trace
         trace=lmbda * trace *gamma
         s = sp
     
     
     print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
     returnSum = returnSum + G
Example #32
def trueOnlinePolicyGradient():
	# logging.basicConfig(filename='example.log',level=logging.DEBUG)
	for alpha_v in alpha_v_list:
		alpha_v = alpha_v * 1.0 / num_tilings
		for alpha_pi in alpha_pi_list:
			alpha_pi = alpha_pi * 1.0 / num_tilings
			print 'alpha_v: ', alpha_v, ' alpha_pi: ', alpha_pi

			avg_steps_overall = 0.0
			avg_steps_per_run = np.zeros((num_runs, ))
			avg_steps_per_episode = np.zeros((num_episodes, ))

			start_time = time.clock()
			for current_run in range(num_runs):
				logging.debug("Run #:" + str(current_run))
				# print 'Run #:', current_run
				theta = 0.00001 * np.random.randn(mem_size, num_actions)
				w = 0.00001 * np.random.randn(mem_size, )
				# w_old = np.zeros((mem_size, ))
				v_old = 0.0

				steps_per_episode = np.zeros((num_episodes, ))
				avg_steps = 0.0

				for current_episode in range(num_episodes):

					# if (current_episode+1) % 10 == 0:
					# 	plotWeights(theta, w, current_episode)

					G = 0.0
					step = 0

					z_theta = np.zeros((mem_size, num_actions))
					z_theta_old = np.zeros((mem_size, num_actions))
					z_w = np.zeros((mem_size, ))

					(pos, vel) = mountaincar.init()
					phi = np.zeros((mem_size, ))
					tiled_indices = tilecode(pos, vel)
					phi[tiled_indices] = 1
					current_state = (pos, vel)
					(a_star, PG_star) = sampleAction(theta, phi)

					a_prime = 0
					PG_prime = np.zeros((mem_size, num_actions))

					while current_state is not None and step < max_steps:
						reward, next_state = mountaincar.sample(current_state, a_star)

						G += (gamma * reward)
						step += 1

						v_current = np.dot(w.transpose(), phi)
						v_next = 0.0
						phi_prime = np.zeros((mem_size, ))
						if next_state is not None:
							tiled_indices = tilecode(next_state[0], next_state[1])
							phi_prime[tiled_indices] = 1
							v_next = np.dot(w.transpose(), phi_prime)
							(a_prime, PG_prime) = sampleAction(theta, phi_prime)
						delta = reward + (gamma * v_next) - v_current
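						# one-step TD error; it drives both the critic (w) and the actor (theta) updates below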

						# z_w = (gamma * lmbda * z_w) + phi - (alpha_v * gamma * lmbda * np.dot(z_w.transpose(), phi) * phi)
						# w += (alpha_v * (delta + v_current - v_old) * z_w - alpha_v * (v_current - v_old) * phi)

						z_w = (gamma * lmbda * z_w) + phi
						w += (alpha_v * delta * z_w)

						# z_theta = (gamma * lmbda * z_theta) + PG_star
						# theta += ((alpha_pi * z_theta * delta) + ((alpha_pi * z_theta_old) * (v_current - v_old)))

						z_theta = (gamma * lmbda * z_theta) + PG_star
						theta += (alpha_pi * delta * z_theta)

						v_old = v_next
						z_theta_old = np.copy(z_theta)
						phi = np.copy(phi_prime)
						a_star = a_prime
						current_state = next_state
						PG_star = np.copy(PG_prime)

					# print '########### Episode: ', current_episode, ' Return: ', G, ' Steps: ', step, " Run: ", current_run
					steps_per_episode[current_episode] = step
					avg_steps += step
				avg_steps = avg_steps * 1.0 / num_episodes
				avg_steps_overall += avg_steps
				avg_steps_per_run[current_run] = avg_steps

				avg_factor = 1.0 / (current_run + 1)
				for episode_i in range(num_episodes):
					avg_steps_per_episode[episode_i] *= (1 - avg_factor)
					avg_steps_per_episode[episode_i] += (avg_factor * steps_per_episode[episode_i])

			end_time = time.clock()
			elapsed_time = (end_time - start_time) / 60.0
			print 'Elapsed time: ', elapsed_time
			# logging.debug('Elapsed time: ' + str(elapsed_time))
			avg_steps_overall = avg_steps_overall * 1.0 / num_runs
			std_error = 0.0
			for run_i in range(num_runs):
				avg_factor_run = 1.0 / (run_i + 1)
				std_error = ((1 - avg_factor_run) * std_error) + (avg_factor_run * (avg_steps_per_run[run_i] - avg_steps_overall) * (avg_steps_per_run[run_i] - avg_steps_overall))
			std_error = np.sqrt(std_error * 1.0 / num_runs)

			total_steps = avg_steps_overall * num_episodes * num_runs
			print 'Time per step: ', (elapsed_time * 1.0 / total_steps)
			print 'alpha_v: ', alpha_v, ' alpha_pi: ', alpha_pi, ' lmbda: ', lmbda
			print  'average reward: ', -1.0 * avg_steps_overall, ' std. error: ', std_error
			print 'Policy gradient'
Example #33
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    #Q=zeros(3)

    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        step = 0
        S = mountaincar.init()
        tileindec = tilecode(S[0], S[1], [-1] * numTilings)
        #        Q=Qs(tileindec,theta1)
        #        act=argmax(Q)

        #derivate=zeros(n)

        while S != None:
            step += 1
            #derivate=zeros(n)
            #tileindec=tilecode(S[0],S[1],[-1] * numTilings)
            #Q=Qs(tileindec,theta1)
            if random() < epsilon:
                act = randint(0, 3)
            else:
                act = argmax(Qs(tileindec, theta1 + theta2))

            R, Stemp = mountaincar.sample(S, act)

            G += R
            if Stemp == None:
                # terminal transition: the next-state value is zero, so the target is just R
                pro = randint(0, 2)
                q1 = Qs(tileindec, theta1)
                q2 = Qs(tileindec, theta2)
                if pro == 1:
                    update = alpha * (R - q1[act])
                    for i in tileindec:
                        theta1[i + act * 324] += update
                else:
                    update = alpha * (R - q2[act])
                    for i in tileindec:
                        theta2[i + act * 324] += update
                break
            else:
                tileindec_tem = tilecode(Stemp[0], Stemp[1], [-1] * numTilings)
                pro = randint(0, 2)
                # action values at the current state (the term being corrected) ...
                q1_S = Qs(tileindec, theta1)
                q2_S = Qs(tileindec, theta2)
                # ... and at the next state (the bootstrap term)
                q1 = Qs(tileindec_tem, theta1)
                q2 = Qs(tileindec_tem, theta2)
                if pro == 1:
                    # update theta1, letting theta2 evaluate theta1's greedy next action
                    update = alpha * (R + q2[argmax(q1)] - q1_S[act])
                    for i in tileindec:
                        theta1[i + act * 324] += update
                else:
                    # symmetric update for theta2
                    update = alpha * (R + q1[argmax(q2)] - q2_S[act])
                    for i in tileindec:
                        theta2[i + act * 324] += update
                S = Stemp
                tileindec = tileindec_tem


#            for i in tileindec:
#                derivate[i+act*324]=1
#
#            if Stemp==None:
#                #print(Stemp)
#                for i in range(n):
#                    theta1[i]=theta1[i]+alpha*(R-Q[act])*derivate[i]
#                break;
#            else:
#
#                tileindec_tem=tilecode(Stemp[0],Stemp[1],[-1] * numTilings)
#                Q_tem=Qs(tileindec_tem,theta1)
#                #print(Q_tem)
#                act_tem=argmax(Q_tem)
#
#                for i in range(n):
#                    theta1[i]=theta1[i]+alpha*(R+gamma*(Q_tem[act_tem])-Q[act])*derivate[i]
#                S=Stemp
#                #print(S)

#        ...
#        your code goes here (20-30 lines, depending on modularity)
#        ...
        print("Episode: ", episodeNum, "Steps:", step, "Return: ", G)
        returnSum = returnSum + G
    print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
Example #34
    for episodeNum in xrange(numEpisodes):
        eTrace = [0]*n
        G = 0
        delta = 0

        state = mountaincar.init()
        step = 0
        while state != None:
            step += 1

            tiles = tilecode(state[0], state[1],[-1]*numTilings)
            explore = (random.random() < epsilon)

            if explore:
                action = random.randint(0,2)
                reward, newState = mountaincar.sample(state, action)
            else:
                action = getBestAction(tiles, theta)
                reward, newState = mountaincar.sample(state, action)
            G += reward

            if newState != None:
                delta = reward + updateDelta(tiles, theta, action, newState)
                eTrace = updateETrace(eTrace, tiles, action)
                theta = updateTheta(theta, delta, eTrace)
            else:
                Qa = 0
                for i in tiles:
                    Qa += theta[i + action*4*81]
                delta = reward - Qa
                updateETrace(eTrace, tiles, action)
Example #35
        # initialize observation
        observation = mountaincar.init()

        # use function approximation to generate next state
        tilecode(observation[0], observation[1], observation[2], state)

        # compute the Q values for the state and every action
        Q = Qs(state)

        terminal = False
        A = chooseAction(Q)
        unknownObs = observation
        
        if flipped:
            R, observation, terminal = mountaincar.sample(observation, A, terminal, False)
            someRandomAmountOfTime = random.randint(minNumExtraSteps,maxNumExtraSteps)
            for i in range(1, someRandomAmountOfTime):
                unknownR, observation, terminal = mountaincar.sample(observation, A, terminal, True)
                G += unknownR
            step += someRandomAmountOfTime

        # repeat for each step of episode
        while True:

            if not flipped:
                # take action a and get reward R and new observation
                R, observation, terminal = mountaincar.sample(observation, A, terminal, False)

                # if newObservation is terminal
                if terminal: