Example #1
File: agent.py Project: BepfCp/RL-imple
 def learner(self,
             gamma=0.9,
             alpha=0.1,
             epsilon=1e-5,
             display=False,
             lambda_=None):
     self.state = self.env.reset()
     s0 = self.state
     if display:
         self.env.render()
     a0 = self.perform_policy(s0, epsilon=epsilon)
     time_in_episode, total_reward = 0, 0
     is_done = False
     while not is_done:
         s1, r1, is_done, info, total_reward = self.act(a0)
         if display:
             self.env.render()
         a1 = self.perform_policy(s1, epsilon=epsilon)
         old_q = get_dict(self.Q, s0, a0)
         q_prime = get_dict(self.Q, s1, a1)
         td_target = r1 + gamma * q_prime
         new_q = old_q + alpha * (td_target - old_q)
         set_dict(self.Q, new_q, s0, a0)
         s0, a0 = s1, a1
         time_in_episode += 1
     if display:
         print(self.experience.last_episode)
     return time_in_episode, total_reward
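
All of the RL examples on this page read and write Q values (and visit counts) through get_dict/set_dict helpers that are not shown here. A minimal sketch of helpers matching the call pattern get_dict(d, s, a) / set_dict(d, value, s, a), assuming plain dicts with tuple keys and a default of 0 (the project's own implementation may differ, e.g. string keys):

# Hypothetical helpers matching how the examples call them; keys are
# (state,) or (state, action) tuples and unseen entries default to 0.
def get_dict(target_dict, *keys):
    return target_dict.get(tuple(keys), 0)

def set_dict(target_dict, value, *keys):
    target_dict[tuple(keys)] = value
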
Example #2
 def learning_method(self,
                     gamma=0.9,
                     alpha=0.1,
                     epsilon=1e-5,
                     display=False,
                     lambda_=None):
     self.state = self.env.reset()
     s0 = self.state
     if display:
         self.env.render()
     # a0 = self.perform_policy(s0, epsilon)
     # print(self.action_t.name)
     time_in_episode, total_reward = 0, 0
     is_done = False
     while not is_done:
         # behaviour policy: epsilon-greedy over the current Q
         a0 = self.perform_policy(s0, epsilon)
         s1, r1, is_done, info, total_reward = self.act(a0)
         if display:
             self.env.render()
         self.policy = greedy_policy
         a1 = greedy_policy(self.A, s1, self.Q)  # bootstrap with the greedy action (Q-learning target)
         old_q = get_dict(self.Q, s0, a0)
         q_prime = get_dict(self.Q, s1, a1)
         td_target = r1 + gamma * q_prime
         #alpha = alpha / num_episode
         new_q = old_q + alpha * (td_target - old_q)
         set_dict(self.Q, new_q, s0, a0)
         # s0, a0 = s1, a1
         s0 = s1
         time_in_episode += 1
     if display:
         print(self.experience.last_episode)
     return time_in_episode, total_reward
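
Example #2 selects actions with perform_policy (epsilon-greedy) but bootstraps with greedy_policy, which is what makes it Q-learning rather than SARSA. Neither policy helper is shown on this page; a plausible sketch consistent with the call greedy_policy(self.A, s1, self.Q), reusing the get_dict sketch above:

import random

def greedy_policy(A, s, Q):
    # pick the action with the highest estimated Q(s, a)
    return max(A, key=lambda a: get_dict(Q, s, a))

def epsilon_greedy_policy(A, s, Q, epsilon=0.1):
    # explore uniformly with probability epsilon, otherwise act greedily
    if random.random() < epsilon:
        return random.choice(list(A))
    return greedy_policy(A, s, Q)
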
Example #3
def policy_evaluate(episodes, V, Ns):
    for episode, r in episodes:
        for s, a in episode:
            ns = get_dict(Ns, s)
            v = get_dict(V, s)
            set_dict(Ns, ns + 1, s)
            set_dict(V, v + (r - v) / (ns + 1), s)
Example #4
 def learn_Q(self, episode, r):  # Learn the Q value from the state sequence
     '''
     Learn from an episode
     '''
     for s, a in episode:
         nsa = get_dict(self.Nsa, s, a)
         set_dict(self.Nsa, nsa + 1, s, a)
         q = get_dict(self.Q, s, a)
         set_dict(self.Q, q + (r - q) / (nsa + 1), s, a)
     self.total_learning_times += 1
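
The update q + (r - q) / (nsa + 1) in Example #4 is the incremental-mean rule: after N updates, Q(s, a) equals the average of the returns observed for that pair. A quick check with made-up numbers (not data from the project):

# toy returns for a single (s, a) pair
returns = [1.0, 0.0, 1.0]
q, n = 0.0, 0
for r in returns:
    q = q + (r - q) / (n + 1)  # same form as the update in learn_Q
    n += 1
print(q, sum(returns) / len(returns))  # both print 0.6666...
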
Example #5
def policy_evaluate(episodes, V, Ns):
    '''Incremental Monte Carlo policy evaluation of state values: discount factor 1,
    zero immediate reward for intermediate states. V and Ns hold the value estimates
    and visit counts accumulated during evaluation; the every-visit counting method
    is used.'''
    for episode, r in episodes:
        for s, a in episode:
            ns = get_dict(Ns, s)
            v = get_dict(V, s)
            set_dict(Ns, ns + 1, s)
            set_dict(V, v + (r - v) / (ns + 1), s)
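
A small usage sketch showing the data layout policy_evaluate expects (made-up states and returns, relying on the get_dict/set_dict sketch above): episodes is a list of (trajectory, return) pairs, each trajectory a list of (state, action) steps, and V and Ns are filled in place.

V, Ns = {}, {}
episodes = [
    ([("s1", "a"), ("s2", "b")], 1.0),  # trajectory that ended with return 1.0
    ([("s1", "b"), ("s3", "a")], 0.0),  # trajectory that ended with return 0.0
]
policy_evaluate(episodes, V, Ns)
print(V)  # each visited state holds the running average of its episode returns ("s1" -> 0.5)
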
Example #6
    def learning_method(self,
                        lambda_=0.9,
                        gamma=0.9,
                        alpha=0.1,
                        epsilon=1e-5,
                        display=False):
        self.state = self.env.reset()
        s0 = self.state
        if display:
            self.env.render()
        a0 = self.perform_policy(s0, epsilon)
        # print(self.action_t.name)
        time_in_episode, total_reward = 0, 0
        is_done = False
        E = {}  # eligibility traces E(s, a)
        while not is_done:
            # take the current action, observe the reward and next state
            s1, r1, is_done, info, total_reward = self.act(a0)
            if display:
                self.env.render()
            a1 = self.perform_policy(s1, epsilon)

            q = get_dict(self.Q, s0, a0)        # Q(s, a)
            q_prime = get_dict(self.Q, s1, a1)  # Q(s', a')
            # TD error: delta = R + gamma * Q(s', a') - Q(s, a)
            delta = r1 + gamma * q_prime - q

            e = get_dict(E, s0, a0)
            e += 1
            set_dict(E, e, s0, a0)
            # for all s in S, a in A
            for s in self.S:
                for a in self.A:
                    e_value = get_dict(E, s, a)
                    old_q = get_dict(self.Q, s, a)
                    # Q(s,a) = Q(s,a) + alpha * delta * E(s,a)
                    new_q = old_q + alpha * delta * e_value
                    # E(s,a) = gamma * lambda * E(s,a)
                    new_e = gamma * lambda_ * e_value
                    set_dict(self.Q, new_q, s, a)
                    set_dict(E, new_e, s, a)
            # s=s', a=a'
            s0, a0 = s1, a1
            time_in_episode += 1
        if display:
            print(self.experience.last_episode)
        return time_in_episode, total_reward
Example #7
    def learning_method(self, gamma=0.9, alpha=0.1, epsilon=1e-5, display=False, lambda_=0.9):
        self.state = self.env.reset()
        s0 = self.state
        if display:
            self.env.render()
        a0 = self.perform_policy(s0, self.Q, epsilon)
        time_in_episode, total_reward = 0, 0
        is_done = False
        E = {}  # eligibility traces E(s, a)
        while not is_done:
            s1, r1, is_done, info, total_reward = self.act(a0)
            if display:
                self.env.render()
            a1 = self.perform_policy(s1, self.Q, epsilon)

            q = get_dict(self.Q, s0, a0)
            q_prime = get_dict(self.Q, s1, a1)
            delta = r1 + gamma * q_prime - q  # TD error: R + gamma * Q(s', a') - Q(s, a)

            e = get_dict(E, s0, a0)
            e += 1
            set_dict(E, e, s0, a0)

            for s in self.S:
                for a in self.A:
                    e_value = get_dict(E, s, a)
                    old_q = get_dict(self.Q, s, a)
                    new_q = old_q + alpha * delta * e_value
                    new_e = gamma * lambda_ * e_value
                    set_dict(self.Q, new_q, s, a)
                    set_dict(E, new_e, s, a)
            s0, a0 = s1, a1
            time_in_episode += 1
        if display:
            print(self.experience.last_episode)
        return time_in_episode, total_reward
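
Examples #6 and #7 sweep every (s, a) pair in S x A on each step, even though the eligibility trace E is non-zero only for pairs visited in the current episode. A sparse variant of that inner loop, assuming E and the Q table are plain dicts keyed the same way as in the get_dict/set_dict sketch above:

for key, e_value in list(E.items()):
    Q[key] = Q.get(key, 0) + alpha * delta * e_value  # Q(s,a) += alpha * delta * E(s,a)
    E[key] = gamma * lambda_ * e_value                # decay the trace
    if E[key] < 1e-8:
        del E[key]                                    # drop negligible traces
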
Example #8
def add_datauser(request):

	errors={}

	if request.POST:
		
		companies = {}
		storage = ""
		for i in request.POST.keys():
			if ("food" in i) or ("weapon" in i):
				quality_v = i[len(i)-1:]
				key = i[:len(i)-1]

				util.set_dict(companies,key,quality_v,request.POST[i])

		if "q_storage" in request.POST:
			storage = request.POST["q_storage"]
		name = request.POST["name"]
		er = request.POST["url_or_id"]
		er_id = 0
		url_citizen = ""
		error = False
		val = URLValidator()
		try:
			val(er)
			if er.index("erepublik.com/en/citizen/profile/") > -1:
				url_citizen = er
				er_id = er.split("/").pop()
		except Exception:
			errors['url'] = "url invalid"

		if util.is_int(er) and int(er) > 0:
			er_id = er
			url_citizen = "http://www.erepublik.com/en/citizen/profile/" + er_id
		elif not url_citizen:
			errors['id citizen'] = "id invalid"

		if er_id == 0 and len(url_citizen)>0:
			error = True

		else:
			error = False
			errors ={}

		if not error:
			try:

				cit=Citizen(name=name,citizen_id_er=er_id,url_citizen=url_citizen)
				cit.save()
				
				Tg = TraningGround(owner_citizen=cit)
				Tg.weights_room    = request.POST['TG1']
				Tg.climbing_center = request.POST['TG2']
				Tg.shooting_range  = request.POST['TG3']
				Tg.special_forces  = request.POST['TG4']
				Tg.save()

				sto=Storage(owner_citizen=cit)

				if storage != "" and util.is_int(storage):
					sto.quantity=int(storage)
			
				sto.save()
				for key in companies:
					company = util.get_company(companies[key])
					company.owner_citizen = cit
					company.save()
				
	
			except DatabaseError as e:
				transaction.rollback()
				errors['database'] = str(e)
			except ValidationError as e:
				transaction.rollback()
				errors['validation'] = str(e)
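
Example #8 splits each matching POST key (e.g. a hypothetical "food3") into a base name and a trailing quality digit, then groups the posted values per company via util.set_dict(companies, key, quality_v, request.POST[i]). That util module is not shown on this page; a plausible stand-in with the same (dict, key, subkey, value) signature:

def set_dict(target_dict, key, subkey, value):
    # e.g. companies["food"]["3"] = "<posted quantity>"
    target_dict.setdefault(key, {})[subkey] = value
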
Example #9
    previous_y = int(round(inputs['previous_mouse']['repr']['ydata']))
    if current_x >= 0 and current_y >= 0 and previous_x >= 0 and previous_y >= 0:
        rr, cc = line(previous_x, previous_y, current_x, current_y)
        dd = np.zeros(len(rr), dtype=np.int64)
        #print(rr.dtype, cc.dtype, dd.dtype)
        # NEED TO EITHER COPY THIS IMAGE, OR ASSUME NON-SHARING (CURRENTLY THIS HOLDS)
        inputs['accum']['repr'][cc, rr, dd] = 1.0  # 255
    return {'current_image_mouse': inputs['accum']}


dmms.type_functions['image_mouse_type'] = draw_a_line_on_image

initial_output = {}

set_dict(initial_output, ['self', 'current matrix'], {
    'kind': 'matrix',
    'repr': {}
})  # this is where the network matrix sits

set_dict(initial_output['self']['current matrix']['repr'],
         ['self', 'accum', 'self', 'current matrix'],
         1)  # this is the "main 1" of that matrix

set_dict(initial_output['self']['current matrix']['repr'],
         ['main_mouse', 'previous', 'main_mouse', 'current_mouse'],
         1)  # mouse neuron in the network matrix

set_dict(initial_output['self']['current matrix']['repr'],
         ['image_mouse', 'accum', 'image_mouse', 'current_image_mouse'],
         1)  # accum connection for image_mouse neuron in the network matrix

set_dict(
Example #10
dmms.type_functions['accum matrix'] = lambda inputs: {
    'current matrix': {
        'kind': 'matrix',
        'repr': dmms.add_nested_dict(
            copy.deepcopy(inputs['accum']['repr']),
            inputs['delta']['repr'] if 'delta' in inputs else {})
    }
}

initial_output = {}

set_dict(initial_output, ['self', 'current matrix'], {
    'kind': 'matrix',
    'repr': {}
})  # this is where the network matrix sits

set_dict(initial_output['self']['current matrix']['repr'],
         ['self', 'accum', 'self', 'current matrix'],
         1)  # this is the "main 1" of that matrix

print('initial_output: ', initial_output)

initial_input = dmms.down_movement(initial_output)

print('initial input: ', initial_input)

second_output = dmms.up_movement(initial_input)

print('second output: ', second_output)
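
Examples #9 and #10 use yet another set_dict signature: a key path (list of keys) and a value to write into a nested dict, creating intermediate levels as needed. The dmms helper itself is not shown here; a minimal sketch consistent with the calls above:

def set_dict(target_dict, key_path, value):
    node = target_dict
    for key in key_path[:-1]:
        node = node.setdefault(key, {})  # create intermediate dicts on demand
    node[key_path[-1]] = value

out = {}
set_dict(out, ['self', 'current matrix'], {'kind': 'matrix', 'repr': {}})
set_dict(out['self']['current matrix']['repr'],
         ['self', 'accum', 'self', 'current matrix'], 1)
# out['self']['current matrix']['repr']['self']['accum']['self']['current matrix'] == 1
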