def __init__(self, fqi, fe, epsilon=0.05):
    """
    Build an epsilon-greedy policy around an FQI policy object.

    :param fqi: an FQI instance from the ifqi package, or a dict of
        keyword arguments used to construct one (in that case it must
        contain a 'discrete_actions' entry)
    :param fe: a feature extractor (method s_features(x) is expected)
    :param epsilon: exploration rate for the policy (0 <= epsilon <= 1)
    """
    self.epsilon = epsilon
    # Placeholders; filled in by the loaders below.
    self.fe = None
    self.actions = None
    self.fqi = None
    self.load_fe(fe)
    # Anything that is not a kwargs dict is handed to load_fqi as-is;
    # a dict is treated as FQI constructor arguments.
    if not isinstance(fqi, dict):
        self.load_fqi(fqi)
    else:
        self.fqi = FQI(**fqi)
        self.actions = fqi['discrete_actions']
regressor = ActionRegressor(regressor, discrete_actions=selected_actions_values, tol=0.5, **fqi_regressor_params) # Create FQI model fqi_params = { 'estimator': regressor, 'state_dim': selected_states_dim, 'action_dim': selected_actions_dim, 'discrete_actions': selected_actions_values, 'gamma': mdp.gamma, 'horizon': args.iterations, 'verbose': True } fqi = FQI(**fqi_params) # Run FQI print('Running FQI...') print('Evaluating policy using model at %s' % args.path) fqi_time = time.time() # Save this for logging average_episode_duration = len(dataset) / np.sum(dataset[:, -1]) iteration_values = [] # Stores performance of the policy at each step fqi_fit_params = {} # Optional parameters for fitting FQI fqi_evaluation_params = { 'metric': 'cumulative', 'n_episodes': 1, 'selected_states': selected_states, 'max_ep_len': 2 * average_episode_duration }
check_dataset(dataset, state_dim, action_dim, reward_dim) # this is just a # check, it can be removed in experiments print('Dataset has %d samples' % dataset.shape[0]) # reward_idx = state_dim + action_dim # sast = np.append(dataset[:, :reward_idx], # dataset[:, reward_idx + reward_dim:-1], # axis=1) # r = dataset[:, reward_idx] sast, r = split_data_for_fqi(dataset, state_dim, action_dim, reward_dim) fqi_iterations = mdp.horizon # this is usually less than the horizon fqi = FQI(estimator=regressor, state_dim=state_dim, action_dim=action_dim, discrete_actions=discrete_actions, gamma=mdp.gamma, horizon=fqi_iterations, verbose=True) fit_params = {} # fit_params = { # "n_epochs": 300, # "batch_size": 50, # "validation_split": 0.1, # "verbosity": False, # "criterion": "mse" # } fqi.partial_fit(sast, r, **fit_params)
# Run the evaluation experiments: for each experiment, collect a fresh
# dataset and repeatedly fit FQI on a growing prefix of episodes, scoring
# the resulting policy after each fit.
for e in range(config['experiment_setting']['evaluation']['n_experiments']):
    print('Experiment: %d' % (e + 1))
    experiment_results = list()

    # Collect as many episodes as the largest requested evaluation size,
    # so every smaller prefix can be carved out of the same dataset.
    dataset = evaluation.collect_episodes(
        mdp,
        n_episodes=np.sort(config['experiment_setting']['evaluation']
                           ['n_episodes'])[-1])
    print('Dataset has %d samples' % dataset.shape[0])

    # Build a fresh FQI model for this experiment.
    fqi = FQI(estimator=regressor,
              state_dim=state_dim,
              action_dim=action_dim,
              discrete_actions=discrete_actions,
              gamma=config['fqi']['gamma'],
              horizon=config['fqi']['horizon'],
              verbose=config['fqi']['verbose'])
    fit_params = config['fit_params']

    if config['experiment_setting']['evaluation']['metric'] == 'n_episodes':
        # Indices of the samples that end an episode (assumes the last
        # dataset column is the episode-termination flag -- TODO confirm).
        # Hoisted out of the inner loop: it is loop-invariant, since the
        # dataset is not modified while fitting.
        episode_end_idxs = np.argwhere(dataset[:, -1] == 1).ravel()
        for i in config['experiment_setting']['evaluation']['n_episodes']:
            # Take the dataset prefix containing the first i episodes.
            last_el = episode_end_idxs[i - 1]
            sast, r = split_data_for_fqi(dataset, state_dim, action_dim,
                                         reward_dim, last_el + 1)
            fqi.fit(sast, r, **fit_params)
            experiment_results.append(evaluate(mdp, fqi, mdp.initial_states,
                                               args))