config=config, device=device, init_exp=config.init_exp, # initial exploration prob final_exp=config.final_exp, # final exploration prob anneal_steps=10000, # N steps for annealing exploration discount_factor=config.discounted_factor, # discount future rewards reg_param=0.01, # regularization constants max_gradient=5, # max gradient norms summary_every=100, batch_size=config.batch_size, verbose=True, with_bit=config.with_bit, replay=config.replay) user = Seq_User_Act(nlg_sample=True, nlg_template=False) system = LooseSystem(config=config) env = Enviroment(user=user, system=system, verbose=True, config=config) sys_act = None status = [] while True: print("-" * 20) # turker_response = state = env.reset(mode=MODE) # turker_response #state = state[0] sys_act = None # initial sys act total_rewards = 0 while True: # print(state) # print(env.system.state) if config.with_bit:
with_bit_all = True

# Propagate command-line flags into the shared config object.
if args.nlg_sample:
    config.nlg_sample = True
else:
    config.nlg_sample = False
if args.save_dir:
    config.save_dir = args.save_dir

# Choose the "loose" or strict agent pair, then build the environment.
# NOTE(review): `Enviroment` is the project's own (misspelled) class name.
if config.loose_agents:
    user = LooseUser(nlg_sample=False)
    system = LooseSystem()
else:
    user = User(nlg_sample=False)
    system = System()
env = Enviroment(user=user, system=system, verbose=True)

sys_act = None
status = []

# Dimensions of the RL state/action spaces, taken from the project config.
state_dim = dialog_config.STATE_DIM
num_actions = dialog_config.SYS_ACTION_CARDINALITY


def run_one_dialog(env, pg_reinforce):
    # Runs a single evaluation dialog; body continues beyond this fragment.
    print("Test Episode "+"-"*20)
    cur_mode = dialog_config.RL_TRAINING