                    kernel_regularizer=l2(1e-4), data_format='channels_first')(cv2)
dense = Flatten()(cv3)
dense = Dense(512, activation='relu', kernel_regularizer=l2(1e-4))(dense)
buttons = Dense(nb_actions, activation='linear', kernel_regularizer=l2(1e-4))(dense)
model = Model(inputs=frame, outputs=buttons)
model.summary()

processor = AtariDQfDProcessor()

# record_demo_data('HeroDeterministic-v4', steps=50000, data_filepath='hero_expert.npy', frame_delay=0.03)

# Load and process the demonstration data.
expert_demo_data = processor.process_demo_data(
    load_demo_data_from_file('hero_expert.npy'))

memory = PartitionedMemory(limit=1000000, pre_load_data=expert_demo_data, alpha=.4,
                           start_beta=.6, end_beta=.6, window_length=WINDOW_LENGTH)

policy = EpsGreedyQPolicy(.01)

dqfd = DQfDAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
                 processor=processor, enable_double_dqn=True,
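
# --- Hedged continuation sketch (not verbatim from the file above; the
# DQfDAgent constructor call is truncated in this excerpt). Assuming
# DQfDAgent keeps keras-rl's standard compile()/fit() interface, training
# would proceed roughly like this; the optimizer settings and step count
# here are illustrative, not the repo's:
#
#   dqfd.compile(Adam(lr=.00025), metrics=['mae'])
#   env = gym.make('HeroDeterministic-v4')
#   dqfd.fit(env, nb_steps=5000000, verbose=2)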
student_model = Model(inputs=sensors, outputs=s_actions)

# "Expert" (regular DQN) model architecture
e_dense = Flatten()(sensors)
e_dense = Dense(64, activation='relu')(e_dense)
e_dense2 = Dense(128, activation='relu')(e_dense)
e_dense3 = Dense(64, activation='relu')(e_dense2)
e_actions = Dense(nb_actions, activation='linear')(e_dense3)
expert_model = Model(inputs=sensors, outputs=e_actions)

processor = RocketProcessor()

model_saves = './model_saves/'

if __name__ == "__main__":
    if args.model == 'student':
        # Load the expert demonstration data, keeping only episodes with positive return.
        expert_demo_data = load_demo_data_from_file(model_saves + 'demos.npy')
        expert_demo_data = reward_threshold_subset(expert_demo_data, 0)
        print(expert_demo_data.shape)
        expert_demo_data = processor.process_demo_data(expert_demo_data)
        # memory
        memory = PartitionedMemory(limit=500000, pre_load_data=expert_demo_data, alpha=.6,
                                   start_beta=.4, end_beta=.4, window_length=WINDOW_LENGTH)
        # policy
        policy = EpsGreedyQPolicy(.01)
        # agent
        dqfd = DQfDAgent(model=student_model, nb_actions=nb_actions,
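
# --- Hedged note (not from the file above): the 'demos.npy' loaded in the
# student branch is produced ahead of time by the demo-recording utility shown
# at the end of this section, along the lines of:
#
#   record_demo_data('RocketLander-v0', steps=50000)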
plot_model(curiosity_inverse_model, show_shapes=True,
           to_file=plot_file_prefix + 'curiosity_inverse_model.png')
######## END INVERSE MODEL ################

model_saves = './demonstrations/'

now = datetime.now()
datestr = now.strftime("%m%d_%H%M%S")
filename_append = args.filename_append
environment_name = args.env

if __name__ == "__main__":
    if args.model == 'student':
        # Load the expert demonstration data, keeping only episodes with positive return.
        expert_demo_data = load_demo_data_from_file(model_saves + demonstrations_file)
        expert_demo_data = reward_threshold_subset(expert_demo_data, 0)
        print(expert_demo_data.shape)
        expert_demo_data = processor.process_demo_data(expert_demo_data)
        # memory
        memory = PartitionedMemory(limit=500000, pre_load_data=expert_demo_data, alpha=.6,
                                   start_beta=.4, end_beta=.4, window_length=WINDOW_LENGTH)
        # policy
        policy = EpsGreedyQPolicy(.01)
        # agent
        dqfd = DQfDAgent(model=student_model, nb_actions=nb_actions,
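
# --- Quick illustration (not in the file above) of the timestamp format used
# for demonstration filenames: strftime("%m%d_%H%M%S") renders month/day, an
# underscore, then hour/minute/second, e.g.
#
#   datetime(2019, 5, 4, 13, 7, 2).strftime("%m%d_%H%M%S")  # -> '0504_130702'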
    episode_start = 0
    for i, transition in enumerate(demo_array):
        # Log-clip the raw reward to compress its scale.
        reward = transition[-2]
        reward = np.sign(reward) * np.log(1 + abs(reward))
        episode_total += reward
        if transition[-1]:  # terminal transition: close out the episode
            episode_rs[episode_start:i + 1] = episode_total
            episode_total = 0
            episode_start = i + 1
    return episode_rs


def reward_threshold_subset(demo_array, reward_min):
    """Return only the transitions whose episode return exceeds reward_min."""
    rs = calc_ep_rs(demo_array)
    cropped_demos = []
    for i, transition in enumerate(demo_array):
        if rs[i] > reward_min:
            cropped_demos.append(transition)
    return np.array(cropped_demos)


def demo_avg(demo_array):
    # Deduplicate the per-transition returns down to one value per episode
    # (this assumes distinct episode totals), then average.
    demo_array = np.array(list(set(calc_ep_rs(demo_array))))
    return np.mean(demo_array)


if __name__ == "__main__":
    record_demo_data('RocketLander-v0', steps=50000)
    print(len(load_demo_data_from_file()))
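
# A minimal, self-contained check (not in the original script) of the filtering
# semantics above. Each row stands in for a transition whose last two entries
# are (reward, terminal); the leading columns are dummy observation/action
# values. Episode 1 has log-clipped return 2 * log(2) > 0 and is kept;
# episode 2 has return -log(3) < 0 and is dropped.
if __name__ == "__main__":
    toy = np.array([
        [0.0, 0.0, 1.0, 0.0],   # episode 1, step 1
        [0.0, 0.0, 1.0, 1.0],   # episode 1 ends; clipped return ~= 1.39
        [0.0, 0.0, -2.0, 1.0],  # episode 2 ends; clipped return ~= -1.10
    ])
    assert len(reward_threshold_subset(toy, 0)) == 2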