def train(train_csv=None, test_csv=None, full_train_data=None, min_split=1, lag=60, iterations=10, batch_size=10):
    # Input checking
    assert (full_train_data is not None) or (train_csv is not None), 'No expanded training dataset provided for FQI.'
    logging.info('Starting...')

    # ======== FQI data loading ========
    if full_train_data is None:
        # Generate dataset directly
        from dataset_generation import generate
        fqi_data = generate(source=train_csv, lag=lag)
        logging.info('Generated FQI extended dataset: %s' % (fqi_data.shape, ))
    else:
        fqi_data = pd.read_csv(full_train_data)
        logging.info('Loaded FQI extended dataset: %s' % (fqi_data.shape, ))

    # ======== FQI data preparation ========
    state_features, next_state_features = feature_selection(strategy='full', lag=lag)
    states_actions = fqi_data[state_features].values
    next_states = fqi_data[next_state_features].values
    rewards = fqi_data['reward'].values
    absorbing_states = fqi_data['done'].values
    logging.info('Separated columns for FQI.')

    # ======== Setting FQI parameters ========
    # Create target environment to test during training
    training_env = VecTradingDerivatives(data=train_csv, n_envs=N_ENVS, maximum_drowdown=-1)
    logging.info('Created training environment.')
    regressor_params = {'n_estimators': 50,
                        'criterion': 'mse',
                        'min_samples_split': min_split,
                        'min_samples_leaf': 1,
                        'n_jobs': -1}
    actions = [-1, 0, 1]
    pi = EpsilonGreedy(actions, ZeroQ(), epsilon=0)  # Greedy policy
    # Baseline score for the environment
    rets, _ = test_policy(training_env)
    logging.info('Random policy total profit: %s' % (np.sum(rets), ))
    # Create algorithm
    algorithm = FQI(training_env, pi, verbose=False, actions=actions,
                    batch_size=batch_size, max_iterations=iterations,
                    regressor_type=ExtraTreesRegressor, **regressor_params)
    logging.info('Algorithm set up, ready to go.')

    # ======== Training Loop ========
    for i in range(iterations):
        algorithm._iter(states_actions, rewards, next_states, absorbing_states)
        logging.info('[ITERATION %s] Metric:' % (i+1, ))
        #pi.Q.set_regressor_params(n_jobs=1)
        rets, _ = test_policy(training_env, policy=algorithm._policy)
        #pi.Q.set_regressor_params(n_jobs=-1)
        logging.info('[ITERATION %s] Testing: %s' % (i+1, np.sum(rets)))

    # ======== Testing ========

    # ======== Results ========
    logging.info('End.')
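# A minimal invocation sketch, not part of the original script: it assumes the module
# is run directly, that logging has not been configured elsewhere, and that
# 'prices_train.csv' is a hypothetical raw price CSV in the format expected by
# dataset_generation.generate.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    train(train_csv='prices_train.csv', lag=60, iterations=10, batch_size=10)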
def test_epsilon(self):
    actions = [10, 100, 1000]
    pi = EpsilonGreedy(actions, QFunction(), 0.5)
    self.assertEqual(0.5, pi.epsilon)
    pi.epsilon = 0.6
    self.assertEqual(0.6, pi.epsilon)
    with self.assertRaises(AttributeError):
        pi.epsilon = -1
    with self.assertRaises(AttributeError):
        pi.epsilon = 2
    with self.assertRaises(AttributeError):
        del pi.epsilon
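# A minimal sketch of the epsilon property the test above exercises. This is an
# assumption about the implementation, not the library's actual code: values outside
# [0, 1] and attribute deletion raise AttributeError, matching the assertions.
class _EpsilonGreedySketch:

    def __init__(self, actions, Q, epsilon):
        self._actions = actions
        self.Q = Q
        self.epsilon = epsilon  # goes through the validating setter below

    @property
    def epsilon(self):
        return self._epsilon

    @epsilon.setter
    def epsilon(self, value):
        if not 0 <= value <= 1:
            raise AttributeError('epsilon must be in [0, 1]')
        self._epsilon = value

    @epsilon.deleter
    def epsilon(self):
        raise AttributeError('epsilon cannot be deleted')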
def fun(algorithm):
    policy = EpsilonGreedy(algorithm._actions, algorithm._policy.Q, 0)
    perf = evaluate_policy(algorithm._mdp, policy, criterion=criterion,
                           n_episodes=n_episodes, initial_states=initial_states,
                           n_threads=n_threads)
    fields = {}
    fields[field_name + "_mean"] = perf[0]
    fields[field_name + "_std"] = perf[1]
    fields[field_name + "_steps"] = perf[2]
    algorithm._result.update_step(**fields)
def __init__(self, ref_df, policy_type, policy_path, action_dispatcher_path):
    self.ref_df = ref_df
    self.tree = spatial.KDTree(list(zip(ref_df['xCarWorld'], ref_df['yCarWorld'])))
    self.end_of_lap = False

    # load policy object
    with open(policy_path, 'rb') as pol:
        pi = pickle.load(pol)
    if policy_type == 'greedy':
        epsilon = 0
        self.policy = EpsilonGreedy(pi.actions, pi.Q, epsilon)
    elif policy_type == 'boltzmann':
        tau = 2
        self.policy = Softmax(pi.actions, pi.Q, tau)
    elif policy_type == 'greedy_noise':
        epsilon = 0
        std = 0.003
        self.policy = EpsilonGreedyNoise(pi.actions, pi.Q, epsilon, std)

    # load action dispatcher object
    with open(action_dispatcher_path, 'rb') as ad:
        self.action_dispatcher = pickle.load(ad)
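# Hedged usage sketch. 'RacingAgent' is a hypothetical name for the class owning the
# __init__ above, the file paths are illustrative, and the exact state layout expected
# by the pickled Q-function is an assumption; KDTree.query and
# EpsilonGreedy.sample_action are the only calls taken from the snippet and the tests.
#
# agent = RacingAgent(ref_df, 'greedy', 'policy.pkl', 'AD_policy.pkl')
# _, nearest_idx = agent.tree.query((x_car_world, y_car_world))  # nearest reference point
# action = agent.policy.sample_action(current_state)             # greedy action from the loaded Q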
pre_callback_list = []
fit_params = {}

max_iterations = 100
batch_size = 20
n_steps = 10
n_runs = 20
n_jobs = 5

""" --- WEIGHTS --- """

var_st = 0.1
var_rw = 0.1

""" --- WFQI --- """

pi = EpsilonGreedy(actions, ZeroQ(), 0.1)

k1 = ConstantKernel(2.74**2, constant_value_bounds="fixed") * RBF(length_scale=1.51, length_scale_bounds="fixed")
k2 = ConstantKernel(2.14**2, constant_value_bounds="fixed") * RBF(length_scale=0.92, length_scale_bounds="fixed")
k3 = ConstantKernel(2.42**2, constant_value_bounds="fixed") * RBF(length_scale=2.47, length_scale_bounds="fixed")
k4 = ConstantKernel(3.14**2, constant_value_bounds="fixed") * RBF(length_scale=2.76, length_scale_bounds="fixed")
kernel_st = [k1, k2, k3, k4]
kernel_rw = ConstantKernel(2.03**2, constant_value_bounds="fixed") * RBF(length_scale=2.57, length_scale_bounds="fixed")

algorithm = WFQI(target_mdp,
target_mdp = PuddleWorld(goal_x=5, goal_y=10,
                         puddle_means=[(1.0, 4.0), (1.0, 10.0), (1.0, 8.0), (6.0, 6.0), (6.0, 4.0)],
                         puddle_var=[(.7, 1.e-5, 1.e-5, .7), (.8, 1.e-5, 1.e-5, .8),
                                     (.8, 1.e-5, 1.e-5, .8), (.8, 1.e-5, 1.e-5, .8),
                                     (.8, 1.e-5, 1.e-5, .8)],
                         puddle_slow=False)

mdp = source_mdp_3
file_name = "source_policy_3"

actions = [0, 1, 2, 3]
pi = EpsilonGreedy(actions, ZeroQ(), 0.3)

regressor_params = {'n_estimators': 50,
                    'criterion': 'mse',
                    'min_samples_split': 2,
                    'min_samples_leaf': 1}

fqi = FQI(mdp, pi,
          verbose=True,
          actions=actions,
          batch_size=50,
          max_iterations=60,
          regressor_type=ExtraTreesRegressor,
def run_experiment(track_file_name, rt_file_name, data_path, max_iterations,
                   output_path, n_jobs, output_name, reward_function, r_penalty,
                   rp_kernel, rp_band, ad_type, tuning, tuning_file_name,
                   kdt_norm, kdt_param, filt_a_outliers, double_fqi, evaluation):

    # Load dataset and reference trajectory
    print('Loading data')
    simulations = pd.read_csv(os.path.join(data_path, track_file_name + '.csv'),
                              dtype={'isReference': bool, 'is_partial': bool})
    ref_tr = pd.read_csv(os.path.join(data_path, rt_file_name + '.csv'))

    if r_penalty:
        print('Computing penalty')
        # Take as training laps the laps whose lap time is within 1.5% of the
        # reference trajectory lap time
        all_laps = np.unique(simulations.NLap)
        lap_times = map(lambda lap: simulations[simulations.NLap == lap]['time'].values[-1], all_laps)
        ref_time = ref_tr['time'].values[-1]
        perc_deltas = list(map(lambda t: (abs(t - ref_time) / ref_time * 100) <= 1.5, lap_times))
        right_laps = all_laps[perc_deltas]

        p_params = {}
        if rp_band is not None:
            p_params['bandwidth'] = rp_band
        if rp_kernel is not None:
            p_params['kernel'] = rp_kernel
        penalty = LikelihoodPenalty(**p_params)
        penalty.fit(simulations[simulations.NLap.isin(right_laps)][state_cols].values)

        if reward_function == 'temporal':
            rf = Temporal_projection(ref_tr, penalty=penalty, clip_range=(-np.inf, np.inf))
        elif reward_function == 'discrete':
            rf = Discrete_temporal_reward(ref_tr, penalty=penalty, clip_range=(-np.inf, np.inf))
        elif reward_function == 'distance':
            rf = Spatial_projection(ref_tr, penalty=penalty, clip_range=(-np.inf, np.inf))
        elif reward_function == 'speed':
            rf = Speed_projection(ref_tr, penalty=penalty, clip_range=(-np.inf, np.inf))
        elif reward_function == 'curv':
            rf = Curv_temporal(ref_tr, penalty=penalty, clip_range=(-np.inf, np.inf))
    else:
        if reward_function == 'temporal':
            rf = Temporal_projection(ref_tr)
        elif reward_function == 'discrete':
            rf = Discrete_temporal_reward(ref_tr)
        elif reward_function == 'distance':
            rf = Spatial_projection(ref_tr)
        elif reward_function == 'speed':
            rf = Speed_projection(ref_tr)
        elif reward_function == 'curv':
            rf = Curv_temporal(ref_tr)

    dataset = to_SARS(simulations, rf)

    nmin_list = [1, 2, 5, 10, 15, 20]
    if tuning_file_name:
        print('Tuning file: {}'.format(os.path.join(output_path, tuning_file_name + '.pkl')))
        with open(os.path.join(output_path, tuning_file_name + '.pkl'), 'rb') as tuning:
            gcv = pickle.load(tuning)
    else:
        print("Performing Tuning")
        gcv = run_tuning(dataset, nmin_list, double_fqi, n_jobs, output_path,
                         reward_function + '_tuning')

    if double_fqi:
        mse = -(gcv[0].cv_results_['mean_test_score'] + gcv[1].cv_results_['mean_test_score']) / 2
        nmin = nmin_list[np.argmin(mse)]
    else:
        nmin = gcv.best_params_['min_samples_leaf']

    # Create environment
    state_dim = len(state_cols)
    action_dim = len(action_cols)
    mdp = TrackEnv(state_dim, action_dim, 0.99999, 'continuous')

    # Create policy instance
    epsilon = 0
    pi = EpsilonGreedy([], ZeroQ(), epsilon)

    # Parameters of ET regressor
    regressor_params = {'n_estimators': 100,
                        'criterion': 'mse',
                        'min_samples_split': 2,
                        'min_samples_leaf': nmin,
                        'n_jobs': n_jobs,
                        'random_state': 42}
    regressor = ExtraTreesRegressor

    # Define the order of the columns to pass to the algorithm
    cols = ['t'] + state_cols + action_cols + ['r'] + state_prime_cols + ['absorbing']

    # Define the masks used by the action dispatcher
    state_mask = [i for i, s in enumerate(state_cols) if s in knn_state_cols]
    data_mask = [i for i, c in enumerate(cols) if c in knn_state_cols]

    if ad_type == 'fkdt':
        action_dispatcher = FixedKDTActionDispatcher
        alg_actions = dataset[action_cols].values
    elif ad_type == 'rkdt':
        action_dispatcher = RadialKDTActionDispatcher
        alg_actions = dataset[action_cols].values
    else:
        action_dispatcher = None
        alg_actions = None

    if double_fqi:
        fqi = DoubleFQIDriver
    else:
        fqi = FQIDriver

    algorithm = fqi(mdp=mdp,
                    policy=pi,
                    actions=alg_actions,
                    max_iterations=max_iterations,
                    regressor_type=regressor,
                    data=dataset[cols].values,
                    action_dispatcher=action_dispatcher,
                    state_mask=state_mask,
                    data_mask=data_mask,
                    s_norm=kdt_norm,
                    filter_a_outliers=filt_a_outliers,
                    ad_n_jobs=n_jobs,
                    ad_param=kdt_param,
                    verbose=True,
                    **regressor_params)

    print('Starting execution')
    algorithm.step()

    # save algorithm object
    algorithm_name = output_name + '.pkl'
    with open(output_path + '/' + algorithm_name, 'wb') as output:
        pickle.dump(algorithm, output, pickle.HIGHEST_PROTOCOL)

    # save action dispatcher object
    AD_name = 'AD_' + algorithm_name
    with open(output_path + '/' + AD_name, 'wb') as output:
        pickle.dump(algorithm._action_dispatcher, output, pickle.HIGHEST_PROTOCOL)
    print('Saved Action Dispatcher')

    if evaluation:
        print('Evaluation')
        run_evaluation(output_path + '/' + algorithm_name, track_file_name,
                       data_path, n_jobs, output_path, 'eval_' + output_name,
                       False, output_path + '/' + AD_name)
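# Example invocation sketch for run_experiment. The signature is the one defined above,
# but every file name and parameter value here is illustrative only and would depend on
# the actual track data and tuning setup.
if __name__ == '__main__':
    run_experiment(track_file_name='track_laps',       # hypothetical laps CSV base name
                   rt_file_name='reference_trajectory',  # hypothetical reference-trajectory CSV
                   data_path='./data',
                   max_iterations=50,
                   output_path='./output',
                   n_jobs=4,
                   output_name='fqi_temporal',
                   reward_function='temporal',
                   r_penalty=False,
                   rp_kernel=None,
                   rp_band=None,
                   ad_type='rkdt',
                   tuning=False,
                   tuning_file_name=None,
                   kdt_norm=False,
                   kdt_param=0.1,
                   filt_a_outliers=False,
                   double_fqi=False,
                   evaluation=False)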
def test_sample(self):
    actions = [10, 100, 1000]

    pi = EpsilonGreedy(actions, TestQFunction([10, 100, 1000]), 0)
    self.assertEqual(1000, pi.sample_action(0))
    self.assertTrue(np.linalg.norm(np.array([0., 0., 1.]) - pi(0)) < 0.0000001)
    pi.epsilon = 0.3
    self.assertTrue(np.linalg.norm(np.array([0.1, 0.1, 0.8]) - pi(0)) < 0.0000001)
    pi.epsilon = 0.6
    np.random.seed(0)
    self.assertEqual(100, pi.sample_action(0))

    pi = EpsilonGreedy(actions, TestQFunction([10, 1000, 100]), 0)
    self.assertEqual(100, pi.sample_action(0))
    self.assertTrue(np.linalg.norm(np.array([0., 1., 0.]) - pi(0)) < 0.0000001)
    pi.epsilon = 0.3
    self.assertTrue(np.linalg.norm(np.array([0.1, 0.8, 0.1]) - pi(0)) < 0.0000001)
    pi.epsilon = 0.6
    np.random.seed(0)
    self.assertEqual(100, pi.sample_action(0))

    pi = EpsilonGreedy(actions, TestQFunction([1000, 100, 10]), 0)
    self.assertEqual(10, pi.sample_action(0))
    self.assertTrue(np.linalg.norm(np.array([1., 0., 0.]) - pi(0)) < 0.0000001)
    pi.epsilon = 0.3
    self.assertTrue(np.linalg.norm(np.array([0.8, 0.1, 0.1]) - pi(0)) < 0.0000001)
    pi.epsilon = 0.6
    np.random.seed(0)
    self.assertEqual(100, pi.sample_action(0))
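# The distributions asserted above follow the standard epsilon-greedy rule: every action
# gets probability epsilon / n_actions and the greedy action additionally gets
# 1 - epsilon. A small standalone sketch of that rule (an assumption about the
# implementation, consistent with the assertions, not the library's actual code):
import numpy as np

def epsilon_greedy_distribution(q_values, epsilon):
    """Return action probabilities for one state given its Q-values."""
    probs = np.full(len(q_values), epsilon / len(q_values))
    probs[np.argmax(q_values)] += 1 - epsilon
    return probs

# epsilon_greedy_distribution([10, 100, 1000], 0.3)  ->  array([0.1, 0.1, 0.8])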
    'n_jobs': 1
}

# FIXME:
max_iterations = 10
# FIXME:
batch_size = 10

""" --- FQI --- """

################ TRAIN ##################

filename = 'TRAIN_' + str(train_days) + ' days - ' + str(year_train) + ' - ' + \
           str(minsplit_opt) + ' ms_' + str(max_iterations) + ' it' + '_fs' + str(len(fs))
print(filename)

target_mdp_train = target_mdp_train_1  # FIXME: change mdp
n_days_train = train_days

epsilon = 0
pi = EpsilonGreedy(actions, ZeroQ(), epsilon)
#type(pi)

#dat_ = pd.read_csv('dat_fqi_train_1.csv')  # FIXME: change csv
#dat_ar = dat_.values
#r = (dat_['REWARD']).values  # REWARD
#s_prime = np.column_stack(((dat_['PORTFOLIO_p']).values, (dat_['TIME_p']).values, (dat_ar[:,185:245])))  # STATE PRIME
absorbing = (dat_fqi['DONE']).values  # DONE
#sa = np.column_stack(((dat_['PORTFOLIO']).values, (dat_['TIME']).values, (dat_ar[:,65:125]), (dat_['ACTION']).values))  # STATE ACTION

algorithm = FQI(target_mdp_train,
                pi,
                verbose=True,
                actions=actions,
                batch_size=batch_size,