def __init__(self, kernel_width, seed, nRFF, n_feat):
    rbf_sampler = RBFSampler(gamma=kernel_width, random_state=seed, n_components=nRFF)
    rbf_sampler.fit(np.zeros((1, n_feat)))
    self.W = rbf_sampler.random_weights_
    self.b = rbf_sampler.random_offset_
    self.nRFF = nRFF
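# For reference, a minimal sketch (not part of the original code) of how the stored
# W and b reproduce RBFSampler.transform, which computes sqrt(2/D) * cos(X W + b):
import numpy as np

def rff_transform(X, W, b, nRFF):
    # equivalent to RBFSampler.transform for the sampler fitted above
    return np.sqrt(2.0 / nRFF) * np.cos(X @ W + b)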
class KCCA():
    def __init__(self, n_components=256):
        self.CCA = CCA(n_components)

    # shape of A: N x d
    def fit(self, A, B):
        A = deepcopy(A)
        B = deepcopy(B)
        self.rbf_feature_A = RBFSampler(gamma=1, n_components=len(A))
        self.rbf_feature_B = RBFSampler(gamma=1, n_components=len(B))
        self.rbf_feature_A.fit(A)
        self.rbf_feature_B.fit(B)
        A = self.rbf_feature_A.transform(A)
        B = self.rbf_feature_B.transform(B)
        self.CCA.fit(A, B)

    def transform_a(self, A):
        A = deepcopy(A)
        A = self.rbf_feature_A.transform(A)
        return self.CCA.transform_a(A)

    def transform_b(self, B):
        B = deepcopy(B)
        B = self.rbf_feature_B.transform(B)
        return self.CCA.transform_b(B)
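# A hedged usage sketch for the KCCA class above, under hypothetical data shapes
# (two views with matching sample counts) and assuming the custom CCA object
# exposes transform_a/transform_b as used above. Note that n_components of each
# RBFSampler is tied to the number of samples here:
import numpy as np

A = np.random.randn(200, 10)   # view 1 (hypothetical)
B = np.random.randn(200, 15)   # view 2 (hypothetical)
kcca = KCCA(n_components=8)
kcca.fit(A, B)
A_c = kcca.transform_a(A)      # canonical components of view 1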
def computeKernelMatrix(self, Graphs):
    print("Computing gram matrix")
    # Preprocessing step: approximate the RBF kernel on vector node labels with
    # explicit random Fourier features, stored on every node as 'veclabel_rbf'.
    labels = set()
    for g in Graphs:
        for _, d in g.nodes(data=True):
            labels.add(tuple(d['veclabel']))
    labels_list = [list(l) for l in labels]
    print("Size of labels matrix:", len(labels_list), len(labels_list[0]))
    feature_map_fourier = RBFSampler(gamma=(1.0 / len(labels_list[0])),
                                     random_state=1, n_components=self.n_comp)
    feature_map_fourier.fit(labels_list)
    for g in Graphs:
        for n, d in g.nodes(data=True):
            g.nodes[n]['veclabel_rbf'] = feature_map_fourier.transform([d['veclabel']])[0]
    print("RBF approximation finished.")

    Gram = np.empty(shape=(len(Graphs), len(Graphs)))
    progress = 0
    FeatureMaps = []
    for i in range(len(Graphs)):
        FeatureMaps.append(self.generateGraphFeatureMap(Graphs[i], self.max_radius))
    print("FeatureVectors calculated")
    for i in range(len(Graphs)):
        for j in range(i, len(Graphs)):
            progress += 1
            Gram[i][j] = self._kernelFunctionFeatureVectors(FeatureMaps[i], FeatureMaps[j])
            Gram[j][i] = Gram[i][j]
            if progress % 1000 == 0:
                print("k", end="")
                sys.stdout.flush()
            elif progress % 100 == 0:
                print(".", end="")
                sys.stdout.flush()
    return Gram
class RKHSfunction():
    # RKHS function parameterized with random Fourier features; the weights of
    # the linear layer are the decision variables.
    def __init__(self, kernel_gamma, seed=1, n_feat=96):
        self.kernel_gamma = kernel_gamma
        self.n_feat = n_feat
        self.model = nn.Sequential(Flatten(), nn.Linear(n_feat, 1, bias=True))
        self.seed = seed
        # only the Gaussian RKHS is supported for now
        self.rbf_feature = RBFSampler(gamma=kernel_gamma, n_components=n_feat,
                                      random_state=seed)

    def eval(self, X, fit=False):
        x_reshaped = X.view(X.shape[0], -1)
        if fit:
            self.rbf_feature.fit(x_reshaped)
        if not x_reshaped.requires_grad:
            # only transform during evaluation
            x_feat = self.rbf_feature.transform(x_reshaped)
            rkhsF = self.model(torch.from_numpy(x_feat).float())
        else:
            # differentiable path: fit on detached data, then apply a PyTorch
            # reimplementation of the transform so gradients can flow through X
            x_detach = x_reshaped.detach()
            x_fitted = self.rbf_feature.fit(x_detach, y=None)
            x_feat = pth_transform(x_fitted, x_reshaped)
            rkhsF = self.model(x_feat)[:, 0]
        return rkhsF

    def norm(self):
        return computeRKHSNorm(self.model)

    def set_seed(self, seed):
        # reset the seed of the random features, e.g. for doubly stochastic SGD
        self.seed = seed
        self.rbf_feature = RBFSampler(gamma=self.kernel_gamma,
                                      n_components=self.n_feat, random_state=seed)

    def __call__(self, X, fit=False, random_state=False):
        if random_state is True:
            # draw fresh random features for doubly stochastic gradients
            self.set_seed(seed=np.random)
        return self.eval(X, fit)
def fit(self, X, y=None):
    RBFSampler.fit(self, X=X, y=y)
    for i_pass in range(self.n_pass):
        IntLoss = numpy.zeros((self.n_components, 1))
        EnLoss = numpy.zeros((self.n_components, 1))
        for comp in range(self.n_components):
            if self.verbose:
                print("COMPONENT %d, " % comp, end="")
            indices_minibatch = numpy.random.choice(X.shape[0], self.minibatch_size)
            minibatch = X[indices_minibatch]
            gram_minibatch = rbf_kernel(minibatch, gamma=self.gamma)
            phi = self.transform(minibatch)
            diff_mat = gram_minibatch - numpy.dot(phi, phi.T)
            n_iter = 0
            err = numpy.inf
            IntLoss[comp] = self.loss_function(minibatch, gram_minibatch)
            if self.verbose:
                print('Initial loss', IntLoss[comp])
            while err > self.tol and n_iter < self.max_iter:
                w_old = self.random_weights_[:, comp].copy()
                wx_b = numpy.dot(minibatch, self.random_weights_[:, comp]) + self.random_offset_[comp]
                sin_wx = numpy.sin(wx_b).reshape((-1, 1))
                cos_wx = numpy.cos(wx_b).reshape((-1, 1))
                sin_cos = numpy.dot(sin_wx, cos_wx.T) * 2 / (self.n_components * self.minibatch_size**2)
                diff_sin_cos = numpy.diag(numpy.dot(diff_mat, 2. * sin_cos.T)).reshape((-1, 1))
                dl_dw = numpy.sum(diff_sin_cos * minibatch, axis=0)
                self.random_weights_[:, comp] -= self.alpha * (
                    self.lbda * self.random_weights_[:, comp] + dl_dw)
                if self.update_b:
                    dl_db = numpy.sum(diff_sin_cos)
                    self.random_offset_[comp] -= self.alpha * dl_db
                err = numpy.linalg.norm(w_old - self.random_weights_[:, comp])
                n_iter += 1
            EnLoss[comp] = self.loss_function(minibatch, gram_minibatch)
            if self.verbose:
                print("%d iterations" % n_iter)
                print('End loss', EnLoss[comp])
                time.sleep(2)
    self.initial_loss = IntLoss
    self.end_loss = EnLoss
    return self
def test_classifier_regularization(normalize, loss):
    rng = np.random.RandomState(0)
    transformer = RBFSampler(n_components=100, random_state=0, gamma=10)
    transformer.fit(X)
    X_trans = transformer.transform(X)
    if normalize:
        X_trans = StandardScaler().fit_transform(X_trans)
    y, coef = generate_target(X_trans, rng, -0.1, 0.1)
    y_train = y[:n_train]
    y_test = y[n_train:]
    y_train = np.sign(y_train)
    y_test = np.sign(y_test)

    # overfitting
    clf = SGDClassifier(transformer, max_iter=500, warm_start=True,
                        verbose=False, fit_intercept=True, loss=loss,
                        alpha=0.00001, intercept_decay=1e-10, random_state=0,
                        tol=0, normalize=normalize)
    clf.fit(X_train[:100], y_train[:100])
    train_acc = clf.score(X_train[:100], y_train[:100])
    assert train_acc >= 0.95

    # underfitting
    clf_under = SGDClassifier(transformer, max_iter=100, warm_start=True,
                              verbose=False, fit_intercept=True, loss=loss,
                              alpha=10000, random_state=0, normalize=normalize)
    clf_under.fit(X_train, y_train)
    assert np.sum(clf_under.coef_ ** 2) < np.sum(clf.coef_ ** 2)

    # l1 regularization
    clf_l1 = SGDClassifier(transformer, max_iter=100, warm_start=True,
                           verbose=False, fit_intercept=True, loss=loss,
                           alpha=1000, l1_ratio=0.9, random_state=0,
                           normalize=normalize)
    clf_l1.fit(X_train, y_train)
    assert_almost_equal(np.sum(np.abs(clf_l1.coef_)), 0)
def prepare_feature_vector(self, n, H, n_samples):
    rbfs = []
    samples = [self.env.observation_space.sample() for _ in range(n_samples)]
    scaler = StandardScaler()
    samples = scaler.fit_transform(samples)
    for i in range(n):
        r = RBFSampler(n_components=H, gamma=0.8 * (1 + n))
        r.fit(samples)
        rbf = Pipeline(steps=[("scale", scaler), ("rbf", r)])
        rbfs.append(rbf)
    self.feature_generator = FeatureUnion(
        [("rbf-{}".format(i), rbf) for (i, rbf) in enumerate(rbfs)])
class GPSamplePath(Function):
    def __init__(self, seed=1):
        self.dim = 1
        self.bounds = [[-3, 3]]
        self.y_bounds = [-2, 2]
        super().__init__(self.dim, self.bounds, seed)
        self.fit()
        self.min, self.max = self.get_min_max()
        res = minimize(self.value_std, self.bounds, maxf=self.dim * 1000, algmethod=1)
        self.x_opt = res['x'][0]
        self.y_opt = -self.value_std(self.x_opt)

    def base_function(self, x):
        res = (6 * x - 2)**2 * np.sin(12 * x - 4)
        return res

    def fit(self):
        # draw a random function by least-squares fitting random targets at a
        # few anchor points in the random-feature space
        X = np.linspace(self.bounds[0][0], self.bounds[0][1], 3)
        Y = np.random.uniform(self.y_bounds[0], self.y_bounds[1], 3)
        X = X.reshape(-1, 1)
        self.rbf_feature = RBFSampler(gamma=1, n_components=30)
        self.rbf_feature.fit(np.atleast_2d(X[0]))
        phi_X = self.rbf_feature.transform(X)
        # pinv instead of inv: phi_X.T @ phi_X is rank-deficient (3 samples, 30 features)
        self.w = np.linalg.pinv(phi_X.T.dot(phi_X)).dot(phi_X.T).dot(Y)

    def get_min_max(self):
        X = np.linspace(self.bounds[0][0], self.bounds[0][1], 10000)
        Y = self.value(X)
        return np.min(Y), np.max(Y)

    def value(self, x):
        x = np.asarray(x).reshape(-1, 1)
        res = self.rbf_feature.transform(x).dot(self.w)
        return res

    def value_std(self, x):
        res = self.value(x)
        res = (res - self.min) / (self.max - self.min)
        return res

    def get_pool(self, K):
        return np.linspace(self.bounds[0][0], self.bounds[0][1], K)

    def plot(self):
        x_range = np.linspace(self.bounds[0][0], self.bounds[0][1], 100)
        y = self.value_std(x_range)
        plt.plot(x_range, y)
        plt.show()
def build(self, input_shape):
    rbf_sampler = RBFSampler(
        gamma=self.gamma,
        n_components=self.dim,
        random_state=self.random_state)
    x = np.zeros(shape=(1, self.input_dim))
    rbf_sampler.fit(x)
    self.rff_weights = tf.Variable(
        initial_value=rbf_sampler.random_weights_,
        dtype=tf.float32,
        trainable=True,
        name="rff_weights")
    self.built = True
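# A hypothetical call() to pair with this build(), assuming the layer applies the
# random Fourier cosine map with the trainable rff_weights. sklearn's sampler also
# draws a random_offset_, which this build() does not store, so the offset is
# omitted here:
import tensorflow as tf

def call(self, inputs):
    # sqrt(2/D) * cos(X @ W), mirroring RBFSampler.transform up to the offset
    projection = tf.matmul(inputs, self.rff_weights)
    return tf.sqrt(2.0 / self.dim) * tf.cos(projection)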
class _RBFSamplerImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
def test_regressor_regularization(normalize, loss):
    rng = np.random.RandomState(0)
    transformer = RBFSampler(n_components=100, random_state=0, gamma=10)
    transformer.fit(X)
    X_trans = transformer.transform(X)
    if normalize:
        X_trans = StandardScaler().fit_transform(X_trans)
    y, coef = generate_target(X_trans, rng, -0.1, 0.1)
    y_train = y[:n_train]
    y_test = y[n_train:]

    # overfitting
    clf = SAGARegressor(transformer, max_iter=300, warm_start=True,
                        verbose=False, fit_intercept=True, loss=loss,
                        alpha=0.0001, intercept_decay=1e-6, random_state=0,
                        tol=0, normalize=normalize)
    clf.fit(X_train[:100], y_train[:100])
    l2 = np.mean((y_train[:100] - clf.predict(X_train[:100]))**2)
    assert l2 < 0.01

    # underfitting
    clf_under = SAGARegressor(transformer, max_iter=100, warm_start=True,
                              verbose=False, fit_intercept=True, loss=loss,
                              alpha=100000, random_state=0, normalize=normalize)
    clf_under.fit(X_train, y_train)
    assert np.sum(clf_under.coef_ ** 2) < np.sum(clf.coef_ ** 2)

    # l1 regularization
    clf_l1 = SAGARegressor(transformer, max_iter=100, warm_start=True,
                           verbose=False, fit_intercept=True, loss=loss,
                           alpha=1000, l1_ratio=0.9, random_state=0,
                           normalize=normalize)
    clf_l1.fit(X_train, y_train)
    assert_almost_equal(np.sum(np.abs(clf_l1.coef_)), 0)

    # comparison with sgd
    sgd = SGDRegressor(alpha=0.01, max_iter=100, eta0=1,
                       learning_rate='constant', fit_intercept=True,
                       random_state=0)
    sgd.fit(X_trans[:n_train], y_train)
    test_l2_sgd = np.mean((y_test - sgd.predict(X_trans[n_train:]))**2)

    clf = SAGARegressor(transformer, max_iter=100, warm_start=True,
                        verbose=False, fit_intercept=True, loss=loss,
                        alpha=0.01, random_state=0, normalize=normalize)
    clf.fit(X_train, y_train)
    test_l2 = np.mean((y_test - clf.predict(X_test))**2)
    assert test_l2 < test_l2_sgd
def fit(self, X, y=None):
    RBFSampler.fit(self, X=X, y=y)
    Xdim = numpy.shape(X)[1]
    # replace the i.i.d. Gaussian weights with structured orthogonal ones
    sigma = numpy.sqrt(1 / (2 * self.gamma))
    or_rbf = OrthogonalRBF(Xdim=Xdim, nbases=self.n_components,
                           lenscale=sigma, random_state=self.random_state)
    self.weights = or_rbf.W
    self.offset = numpy.random.rand(self.n_components) * (2 * numpy.pi)
    self.random_weights_ = or_rbf.W / sigma
    return self
class RBFSamplerImpl():
    def __init__(self, gamma=1.0, n_components=100, random_state=None):
        self._hyperparams = {
            'gamma': gamma,
            'n_components': n_components,
            'random_state': random_state
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
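# A usage sketch for the wrapper (hypothetical data), assuming SKLModel aliases
# sklearn.kernel_approximation.RBFSampler; it follows the usual transformer contract:
import numpy as np

X = np.random.randn(10, 4)  # hypothetical data
sampler = RBFSamplerImpl(gamma=0.5, n_components=50, random_state=0)
X_feat = sampler.fit(X).transform(X)  # (10, 50) random Fourier features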
class ValueFunction(object):
    """Value function approximator."""

    def __init__(self):
        # sample environment states in order to fit the featurizer
        state_samples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        # standardize features by removing the mean and scaling to unit variance
        self.scaler = StandardScaler()
        self.scaler.fit(state_samples)
        scaler_samples = self.scaler.transform(state_samples)
        # approximate the feature map of an RBF kernel by Monte Carlo
        # approximation of its Fourier transform
        self.featurizer_state = RBFSampler(gamma=0.5, n_components=100)
        self.featurizer_state.fit(scaler_samples)
        # one linear SGD regressor per action
        self.action_models = []
        nA = env.action_space.n
        for na in range(nA):
            model = SGDRegressor(learning_rate="constant")
            model.partial_fit([self.__featurize_state(env.reset())], [0])
            self.action_models.append(model)

    def __featurize_state(self, state):
        scaler_state = self.scaler.transform([state])
        return self.featurizer_state.transform(scaler_state)[0]

    def predict(self, state):
        curr_features = self.__featurize_state(state)
        action_probs = np.array(
            [m.predict([curr_features])[0] for m in self.action_models])
        return action_probs

    def update(self, state, action, y):
        curr_features = self.__featurize_state(state)
        self.action_models[action].partial_fit([curr_features], [y])
class Model:
    def __init__(self, grid):
        # fit the featurizer to sampled states
        samples = gather_samples(grid)
        # self.featurizer = Nystroem()
        self.featurizer = RBFSampler()
        self.featurizer.fit(samples)
        dims = self.featurizer.n_components
        # initialize linear model weights
        self.w = np.zeros(dims)

    def predict(self, s):
        x = self.featurizer.transform([s])[0]
        return x @ self.w

    def grad(self, s):
        x = self.featurizer.transform([s])[0]
        return x
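# Because predict is linear in w, grad(s) returns the feature vector itself, so a
# semi-gradient TD-style update is a one-liner. A minimal sketch with hypothetical
# names (model, s, target, ALPHA):
ALPHA = 0.1  # hypothetical step size

def td_update(model, s, target):
    # move w along the feature vector, proportionally to the TD error
    err = target - model.predict(s)
    model.w += ALPHA * err * model.grad(s)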
def fit(self, X, y=None):
    RBFSampler.fit(self, X=X, y=y)
    Xdim = numpy.shape(X)[1]
    # orthogonalize the random weights and rescale with chi-distributed norms
    # (structured orthogonal random features)
    WRD = self.random_weights_
    Q = numpy.linalg.qr(WRD)[0]
    S = numpy.sqrt(numpy.random.chisquare(Xdim, Xdim))
    weights = numpy.diag(S).dot(Q)
    sigma = numpy.sqrt(2 * self.gamma)
    self.random_weights_ = numpy.sqrt(2 * sigma) * weights
    return self
class LinearRBF(Policy):
    '''RBF features'''

    def __init__(self, state_dim, action_dim, number_of_features):
        Policy.__init__(self, state_dim, action_dim)
        self.rbf_feature = RBFSampler(gamma=25., n_components=number_of_features)
        self.rbf_feature.fit(np.random.randn(action_dim, state_dim))

    def set_theta(self, theta):
        self.theta = theta

    def get_action(self, state):
        features = self.rbf_feature.transform(state.reshape(1, -1))
        action = features @ self.theta[:-self.action_dim].reshape(-1, self.action_dim)
        action = action + self.theta[-self.action_dim:]
        return action.reshape(-1)

    def get_number_of_parameters(self):
        return self.rbf_feature.get_params().get("n_components") * self.action_dim + self.action_dim
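# A small usage sketch (hypothetical dimensions, assuming the Policy base class is
# available) showing how theta must be sized: n_components * action_dim weights
# plus action_dim biases:
import numpy as np

policy = LinearRBF(state_dim=4, action_dim=2, number_of_features=100)
theta = np.zeros(policy.get_number_of_parameters())  # 100 * 2 + 2 = 202
policy.set_theta(theta)
action = policy.get_action(np.random.randn(4))  # shape (2,)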
def test_feature_map_equals_scikit_learn():
    sigma = 2.
    gamma = sigma**2
    N = 10
    D = 20
    m = 3
    X = np.random.randn(N, D)
    np.random.seed(1)
    omega = sigma * np.random.randn(D, m)
    u = np.random.uniform(0, 2 * np.pi, m)

    # make sure the basis is the same
    np.random.seed(1)
    rbf_sampler = RBFSampler(gamma=gamma, n_components=m, random_state=1)
    rbf_sampler.fit(X)
    assert_allclose(rbf_sampler.random_weights_, omega)
    assert_allclose(rbf_sampler.random_offset_, u)

    phi_scikit = rbf_sampler.transform(X)
    phi_mine = feature_map(X, omega, u)
    assert_allclose(phi_scikit, phi_mine)
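# For reference, a feature_map consistent with RBFSampler.transform (a sketch,
# assuming omega and u follow sklearn's sampling convention):
import numpy as np

def feature_map(X, omega, u):
    # sqrt(2/m) * cos(X @ omega + u): the random Fourier feature map
    m = omega.shape[1]
    return np.sqrt(2.0 / m) * np.cos(X @ omega + u)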
class Model:
    def __init__(self, grid):
        # fit the featurizer to sampled (state, action) pairs
        samples = gather_samples(grid)
        # self.featurizer = Nystroem()
        self.featurizer = RBFSampler()
        self.featurizer.fit(samples)
        dims = self.featurizer.n_components
        # initialize linear model weights
        self.w = np.zeros(dims)

    def predict(self, s, a):
        sa = merge_state_action(s, a)
        x = self.featurizer.transform([sa])[0]
        return x @ self.w

    def predict_all_actions(self, s):
        return [self.predict(s, a) for a in ALL_POSSIBLE_ACTIONS]

    def grad(self, s, a):
        sa = merge_state_action(s, a)
        x = self.featurizer.transform([sa])[0]
        return x
class Model:
    def __init__(self, env):
        # fit the featurizer to sampled (state, action) pairs
        self.env = env
        samples = gather_samples(env)
        self.featurizer = RBFSampler()
        self.featurizer.fit(samples)
        dims = self.featurizer.n_components
        # initialize linear model weights
        self.w = np.zeros(dims)

    def predict(self, s, a):
        sa = np.concatenate((s, [a]))
        x = self.featurizer.transform([sa])[0]
        return x @ self.w

    def predict_all_actions(self, s):
        return [self.predict(s, a) for a in range(self.env.action_space.n)]

    def grad(self, s, a):
        sa = np.concatenate((s, [a]))
        x = self.featurizer.transform([sa])[0]
        return x
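# For context, a hedged sketch of how predict_all_actions is typically consumed in
# an epsilon-greedy policy (hypothetical epsilon; uses the env stored on the model):
import numpy as np

def choose_action(model, s, epsilon=0.1):
    # explore uniformly with probability epsilon, otherwise act greedily on Q
    if np.random.random() < epsilon:
        return model.env.action_space.sample()
    return int(np.argmax(model.predict_all_actions(s)))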
def _test_learning_kernel_with_random_feature(divergence, trans=None, rho=1):
    if trans is None:
        trans = RBFSampler(gamma=1, random_state=0)
    trans.set_params(n_components=128)
    X_trans = trans.fit_transform(X)
    score = kernel_alignment(np.dot(X_trans, X_trans.T), y, False)
    lkrf = LearningKernelwithRandomFeature(trans, warm_start=False,
                                           divergence=divergence, eps_abs=1e-6,
                                           eps_rel=1e-6, max_iter=100, rho=rho)
    X_trans = lkrf.fit_transform(X, y)
    score_lkrf = kernel_alignment(np.dot(X_trans, X_trans.T), y, False)
    assert score_lkrf >= score
    assert_almost_equal(np.sum(lkrf.importance_weights_), 1)
    assert np.min(lkrf.importance_weights_) >= 0

    # weaker constraint: increase rho (here 20x)
    trans.fit(X)
    lkrf = LearningKernelwithRandomFeature(trans, warm_start=False,
                                           divergence=divergence, eps_abs=1e-6,
                                           eps_rel=1e-6, max_iter=100, rho=rho * 20)
    X_trans = lkrf.fit_transform(X, y)
    score_lkrf_weak = kernel_alignment(np.dot(X_trans, X_trans.T), y, False)
    print(score_lkrf_weak, score_lkrf, score)
    assert score_lkrf_weak >= score_lkrf

    # remove bases with zero importance weight
    n_nz = np.sum(lkrf.importance_weights_ != 0)
    print(n_nz)
    if lkrf.remove_bases():
        X_trans_removed = lkrf.transform(X)
        assert_almost_equal(X_trans_removed.shape[1], n_nz)
        indices = np.nonzero(lkrf.importance_weights_)[0]
        assert_almost_equal(X_trans_removed, X_trans[:, indices])
class PCPGAgent(BaseAgent):
    def __init__(self, config):
        BaseAgent.__init__(self, config)
        self.config = config
        self.task = config.task_fn()
        self.network, self.optimizer, self.replay_buffer, self.density_model = dict(), dict(), dict(), dict()
        self.replay_buffer_actions = dict()
        self.replay_buffer_infos = dict()
        # create policy networks for the explore, exploit and rollin phases
        for mode in ['explore', 'exploit', 'rollin']:
            self.network[mode] = config.network_fn()
            self.replay_buffer[mode] = []
            self.replay_buffer_actions[mode] = []
            self.replay_buffer_infos[mode] = []
        self.optimizer['explore'] = config.optimizer_fn(self.network['explore'].parameters())
        self.optimizer['exploit'] = config.optimizer_fn(self.network['exploit'].parameters())
        self.total_steps = 0
        self.states = self.task.reset()
        self.states = config.state_normalizer(self.states)
        # list to store policies in the policy cover
        self.policy_mixture = [copy.deepcopy(self.network['explore'].state_dict())]
        # each policy will have its own optimizer
        self.policy_mixture_optimizers = [copy.deepcopy(self.optimizer['explore'].state_dict())]
        # weights among the policies in the cover, which are used to sample
        self.policy_mixture_weights = torch.tensor([1.0])
        self.policy_mixture_returns = []
        self.timestamp = None

        # define the exploration reward bonus
        if self.config.bonus == 'rnd':
            # RND bonus
            self.rnd_network = FCBody(self.config.state_dim).to(Config.DEVICE)
            self.rnd_pred_network = FCBody(self.config.state_dim).to(Config.DEVICE)
            self.rnd_optimizer = torch.optim.RMSprop(self.rnd_pred_network.parameters(), 0.001)
        elif self.config.bonus == 'randnet-kernel-s':
            # random network kernel mapping states to features
            if self.config.game == 'maze':
                self.kernel = ConvFCBodyMaze(size=config.maze_size, in_channels=3,
                                             phi_dim=self.config.phi_dim).to(Config.DEVICE)
            else:
                self.kernel = FCBody(self.config.state_dim,
                                     hidden_units=(self.config.phi_dim, self.config.phi_dim)).to(Config.DEVICE)
        elif self.config.bonus == 'rbf-kernel':
            # RBF kernel via random Fourier features on (state, action) pairs
            self.rbf_feature = RBFSampler(gamma=1, random_state=1, n_components=self.config.phi_dim)
            if isinstance(self.task.action_space, Box):
                self.rbf_feature.fit(X=np.random.randn(5, self.config.state_dim + self.config.action_dim))
            else:
                self.rbf_feature.fit(X=np.random.randn(5, self.config.state_dim + 1))
        if isinstance(self.task.action_space, Box):
            self.uniform_prob = self.continous_uniform_prob()
        else:
            self.uniform_prob = 1. / self.config.action_dim

    # takes as input a minibatch of states (and possibly actions),
    # returns an exploration reward for each
    def compute_reward_bonus(self, states, actions=None):
        if self.config.bonus == 'rnd':
            states = torch.from_numpy(states).float().to(Config.DEVICE)
            rnd_target = self.rnd_network(states).detach()
            rnd_pred = self.rnd_pred_network(states).detach()
            rnd_loss = F.mse_loss(rnd_pred, rnd_target, reduction='none').mean(1)
            reward_bonus = rnd_loss.cpu().numpy()
        elif 'randnet-kernel' in self.config.bonus:
            phi = self.compute_kernel(tensor(states), actions)
            reward_bonus = torch.sqrt((torch.mm(phi, self.density_model) * phi).sum(1)).detach()
        elif 'rbf-kernel' in self.config.bonus:
            assert actions is not None
            phi = self.compute_kernel(tensor(states), tensor(actions))
            reward_bonus = torch.sqrt((torch.mm(phi, self.density_model) * phi).sum(1)).detach()
        elif 'id-kernel' in self.config.bonus:
            phi = self.compute_kernel(tensor(states), actions)
            reward_bonus = torch.sqrt((torch.mm(phi, self.density_model) * phi).sum(1)).detach()
        elif 'counts' in self.config.bonus:
            # can use ground-truth counts in combolock for debugging
            reward_bonus = []
            for s in self.config.state_normalizer(states):
                s = tuple(s)
                if s not in self.density_model['explore'].keys():
                    cnts = 0
                else:
                    cnts = self.density_model['explore'][s]
                if self.config.bonus == 'counts':
                    reward_bonus.append(1.0 / (1.0 + cnts))
                elif self.config.bonus == 'counts-sqrt':
                    reward_bonus.append(1.0 / math.sqrt(1.0 + cnts))
            reward_bonus = np.array(reward_bonus)
        return reward_bonus

    def time(self, tag=''):
        if self.timestamp is None or tag == 'reset':
            self.timestamp = time()
        else:
            t = time()
            print(f'{tag} took {t - self.timestamp:.4f}s')
            self.timestamp = t

    # gather trajectories following a policy and return them in a buffer.
    # explore mode uses the exploration bonus as reward, exploit uses the
    # environment reward. can specify whether to roll in using the policy
    # mixture, or instead use the latest policy
    def gather_trajectories(self, roll_in=True, add_bonus_reward=True, debug=False,
                            mode=None, record_return=False):
        config = self.config
        states = self.states
        network = self.network[mode]
        roll_in_length = 0 if (debug or not roll_in) else random.randint(0, config.horizon - 1)
        roll_out_length = config.horizon - roll_in_length
        storage = Storage(roll_out_length)

        if roll_in_length > 0:
            assert roll_in
            # sample a previous policy to roll in
            i = torch.multinomial(self.policy_mixture_weights.cpu(), num_samples=1)
            self.network['rollin'].load_state_dict(self.policy_mixture[i])

        # roll in
        for _ in range(roll_in_length):
            prediction = self.network['rollin'](states)
            next_states, rewards, terminals, info = self.task.step(to_np(prediction['a']))
            if self.config.game == 'maze':
                for i in info:
                    self.unique_pos.add(tuple(i['agent_pos']))
            next_states = config.state_normalizer(next_states)
            states = next_states
            self.total_steps += config.num_workers

        # roll out
        for i in range(roll_out_length):
            if i == 0 and roll_in:
                # if roll_in is False we ignore epsilon-greedy and simply roll out
                # the current policy; here we are using \hat{\pi}
                sample_eps_greedy = random.random() < self.config.eps
                if sample_eps_greedy:
                    if isinstance(self.task.action_space, Discrete):
                        actions = torch.randint(self.config.action_dim, (states.shape[0],)).to(Config.DEVICE)
                    elif isinstance(self.task.action_space, Box):
                        actions = self.uniform_sample_cont_random_acts(states.shape[0])
                    prediction = network(states, tensor(actions))
                else:
                    prediction = network(states)
                # update log_pi_a to account for the epsilon-greedy mixture
                prediction['log_pi_a'] = (prediction['log_pi_a'].exp() * (1. - self.config.eps)
                                          + self.config.eps * self.uniform_prob).log()
            else:
                # we are using \pi
                prediction = network(states)
            next_states, rewards, terminals, info = self.task.step(to_np(prediction['a']))
            if self.config.game == 'maze':
                for i in info:
                    self.unique_pos.add(tuple(i['agent_pos']))
            if add_bonus_reward:
                s = config.state_normalizer(states)
                reward_bonus = self.config.reward_bonus_normalizer(
                    self.compute_reward_bonus(s, to_np(prediction['a'])))
                rewards = self.config.bonus_coeff * self.config.horizon * reward_bonus
                assert all(rewards >= 0)
            if record_return:
                self.record_online_return(info)
            rewards = config.reward_normalizer(rewards)
            next_states = config.state_normalizer(next_states)
            storage.add(prediction)
            storage.add({'r': tensor(rewards).unsqueeze(-1),
                         'm': tensor(1 - terminals).unsqueeze(-1),
                         'i': list(info),
                         's': tensor(states)})
            states = next_states
            self.total_steps += config.num_workers

        self.states = states
        prediction = network(states)
        storage.add(prediction)
        storage.placeholder()

        advantages = tensor(np.zeros((config.num_workers, 1)))
        returns = prediction['v'].detach()
        for i in reversed(range(roll_out_length)):
            returns = storage.r[i] + config.discount * storage.m[i] * returns
            if not config.use_gae:
                advantages = returns - storage.v[i].detach()
            else:
                td_error = storage.r[i] + config.discount * storage.m[i] * storage.v[i + 1] - storage.v[i]
                advantages = advantages * config.gae_tau * config.discount * storage.m[i] + td_error
            storage.adv[i] = advantages.detach()
            storage.ret[i] = returns.detach()
        return storage

    def log(self, s):
        logtxt(self.logger.log_dir + '.txt', s, show=True, date=False)

    # compute the mapping from states (and possibly actions) to features
    def compute_kernel(self, states, actions=None):
        actions_one_hot = tensor(np.eye(self.config.action_dim)[actions])
        # state_actions = torch.cat((tensor(states).to(Config.DEVICE), actions_one_hot), dim=1)
        if self.config.bonus == 'randnet-kernel-s':
            phi = F.normalize(self.kernel(tensor(states).to(Config.DEVICE)), p=2, dim=1)
        elif self.config.bonus == 'randnet-kernel-sa':
            phi = F.normalize(self.kernel(state_actions), p=2, dim=1)
        elif self.config.bonus == 'id-kernel-s':
            phi = states.to(Config.DEVICE)
        elif self.config.bonus == 'id-kernel-sa':
            phi = state_actions
        elif self.config.bonus == 'rbf-kernel':
            assert actions is not None
            # concatenate state and action features, then apply the random Fourier map
            np_states = states.cpu().numpy()
            np_actions = actions.cpu().numpy()
            if isinstance(self.task.action_space, Discrete):
                np_actions = np.expand_dims(np_actions, axis=1)
            assert np_actions.ndim == 2 and np_actions.shape[0] == np_states.shape[0]
            states_acts_cat = np.concatenate((np_states, self.clip_actions(np_actions)), axis=1)
            phi = self.rbf_feature.transform(states_acts_cat)
            phi = torch.tensor(phi).to(Config.DEVICE)
        else:
            raise NotImplementedError
        return phi

    # for visualizing visitations in combolock
    def log_visitations(self, visitations):
        self.log('lock1')
        self.log(np.around(visitations[0], 3))
        self.log('lock2')
        self.log(np.around(visitations[1], 3))

    # turn a count-based density model into a visitation table
    def compute_state_visitations(self, density_model, use_one_hot=False):
        locks = [np.zeros((3, self.config.horizon - 1)), np.zeros((3, self.config.horizon - 1))]
        N = sum(list(density_model.values()))
        for state in density_model.keys():
            if use_one_hot:
                k = np.argmax(state)
                (s, l, h) = np.unravel_index(k, (3, 3, self.config.horizon))
                if l in [0, 1]:
                    locks[l][s][h] += float(density_model[state]) / N
            else:
                if not all(np.array(state) == 0.0):
                    s = np.argmax(state[:3])
                    l = int(state[-1])
                    h = np.argmax(state[3:-1])
                    locks[l][s][h] += float(density_model[state]) / N
        return locks

    # update the density model using data from the replay buffer;
    # also computes covariance matrices in the kernel case
    def update_density_model(self, mode=None):
        replay_buffer = self.replay_buffer[mode]
        replay_buffer_act = self.replay_buffer_actions[mode]
        states = torch.cat(sum(replay_buffer, []))
        actions = torch.cat(sum(replay_buffer_act, []))
        if self.config.bonus == 'rnd':
            states = states.to(Config.DEVICE)
            targets = self.rnd_network(states).detach()
            data = DataLoader(TensorDataset(states, targets), batch_size=100, shuffle=True)
            for i in range(1):
                total_loss = 0
                losses = []
                for j, batch in enumerate(data):
                    self.rnd_optimizer.zero_grad()
                    pred = self.rnd_pred_network(batch[0])
                    loss = F.mse_loss(pred, batch[1], reduction='none')
                    loss.mean().backward()
                    self.rnd_optimizer.step()
                    total_loss += loss.mean().item()
                    losses.append(loss)
                print(f'[RND loss: {total_loss / j:.5f}]')
            bonuses = torch.cat(losses).view(-1)
        elif self.config.bonus == 'rbf-kernel':
            # set the RBF bandwidth with the median heuristic, then build the
            # inverse covariance used by the elliptical bonus
            N = states.shape[0]
            ind = np.random.choice(N, min(2000, N), replace=False)
            pdists = scipy.spatial.distance.pdist((states.cpu().numpy())[ind])
            self.rbf_feature.gamma = 1. / (np.median(pdists) ** 2)
            phi = self.compute_kernel(states, actions=actions)
            n, d = phi.shape
            sigma = torch.mm(phi.t(), phi) + self.config.ridge * torch.eye(d).to(Config.DEVICE)
            self.density_model = torch.inverse(sigma).detach()
            covariance_matrices = []
            assert len(replay_buffer) == len(replay_buffer_act)
            for i in range(len(replay_buffer)):
                states = torch.cat(replay_buffer[i])
                actions = torch.cat(replay_buffer_act[i])
                phi = self.compute_kernel(states, actions)
                n, d = phi.shape
                sigma = torch.mm(phi.t(), phi) + self.config.ridge * torch.eye(d).to(Config.DEVICE)
                covariance_matrices.append(sigma.detach())
            m = 0
            for matrix in covariance_matrices:
                m = max(m, matrix.max())
            covariance_matrices = [matrix / m for matrix in covariance_matrices]
        elif 'kernel' in self.config.bonus:
            N = states.shape[0]
            phi = self.compute_kernel(states, actions)
            n, d = phi.shape
            sigma = torch.mm(phi.t(), phi) + self.config.ridge * torch.eye(d).to(Config.DEVICE)
            self.density_model = torch.inverse(sigma).detach()
            covariance_matrices = []
            assert len(replay_buffer) == len(replay_buffer_act)
            for i in range(len(replay_buffer)):
                states = torch.cat(replay_buffer[i])
                actions = torch.cat(replay_buffer_act[i])
                phi = self.compute_kernel(states, actions)
                n, d = phi.shape
                sigma = torch.mm(phi.t(), phi) + self.config.ridge * torch.eye(d).to(Config.DEVICE)
                covariance_matrices.append(sigma.detach().cpu())
            m = 0
            for matrix in covariance_matrices:
                m = max(m, matrix.max())
            covariance_matrices = [matrix / m for matrix in covariance_matrices]
        elif 'counts' in self.config.bonus:
            states = [tuple(s) for s in states.numpy()]
            unique_states = list(set(states))
            self.density_model[mode] = dict(zip(unique_states, [0] * len(unique_states)))
            for s in states:
                self.density_model[mode][s] += 1
            bonuses = torch.tensor([1.0 / self.density_model[mode][s] for s in states])
            covariance_matrices, visitations = [], []
            for i, states in enumerate(replay_buffer):
                states = [tuple(s) for s in torch.cat(states).numpy()]
                density_model = dict(zip(unique_states, [0] * len(unique_states)))
                for s in states:
                    density_model[s] += 1
                sums = torch.tensor([density_model[s] for s in unique_states]).float()
                covariance_matrices.append(torch.diag(sums) + torch.eye(len(unique_states)))
                visitations.append(self.compute_state_visitations(density_model))
            m = 0
            for matrix in covariance_matrices:
                m = max(m, matrix.max())
            covariance_matrices = [matrix / m for matrix in covariance_matrices]

        if mode == 'explore':
            self.optimize_policy_mixture_weights(covariance_matrices)

        # for combolock, compute the visitations for each policy
        if 'combolock' in self.config.game:
            visitations = []
            states = torch.cat(sum(replay_buffer, []))
            states = [tuple(s) for s in states.numpy()]
            unique_states = list(set(states))
            for i, states in enumerate(self.replay_buffer[mode]):
                states = [tuple(s) for s in torch.cat(states).numpy()]
                density_model = dict(zip(unique_states, [0] * len(unique_states)))
                for s in states:
                    density_model[s] += 1
                visitations.append(self.compute_state_visitations(density_model))
            if mode == 'explore':
                weighted_visitations = [np.zeros((3, self.config.horizon - 1)),
                                        np.zeros((3, self.config.horizon - 1))]
                for i in range(len(visitations)):
                    weighted_visitations[0] += self.policy_mixture_weights[i].item() * visitations[i][0]
                    weighted_visitations[1] += self.policy_mixture_weights[i].item() * visitations[i][1]
                for i in range(len(visitations)):
                    self.log(f'\nstate visitations for policy {i}:')
                    self.log_visitations(visitations[i])
                self.log(f'\nstate visitations for weighted policy mixture:')
                self.log_visitations(weighted_visitations)
            elif mode == 'exploit':
                self.log(f'\nstate visitations for exploit policy:')
                self.log_visitations(visitations[-1])
        self.reward_bonus_normalizer = RescaleNormalizer()

    # optimize policy mixture weights using a log-determinant loss
    def optimize_policy_mixture_weights(self, covariance_matrices):
        d = covariance_matrices[0].shape[0]
        N = len(covariance_matrices)
        if N == 1:
            self.policy_mixture_weights = torch.tensor([1.0])
        else:
            self.log_alphas = nn.Parameter(torch.randn(N))
            opt = torch.optim.Adam([self.log_alphas], lr=0.001)
            for i in range(5000):
                opt.zero_grad()
                sigma_weighted_sum = torch.zeros(d, d)
                for n in range(N):
                    sigma_weighted_sum += F.softmax(self.log_alphas, dim=0)[n] * covariance_matrices[n]
                loss = -torch.logdet(sigma_weighted_sum)
                if math.isnan(loss.item()):
                    pdb.set_trace()
                if not i % 500:
                    print(f'optimizing log det, loss={loss.item()}')
                loss.backward()
                opt.step()
            with torch.no_grad():
                self.policy_mixture_weights = F.softmax(self.log_alphas, dim=0)
        self.log(f'\npolicy mixture weights: {self.policy_mixture_weights.numpy()}')

    # roll out using the explore/exploit policies and store data in the replay buffer
    def update_replay_buffer(self):
        print('[gathering trajectories for replay buffer]')
        for mode in ['explore', 'exploit']:
            states, actions, returns, infos = [], [], [], []
            for _ in range(self.config.n_rollouts_for_density_est):
                new_traj = self.gather_trajectories(roll_in=False, add_bonus_reward=False,
                                                    mode=mode, record_return=(mode == 'exploit'))
                states += new_traj.cat(['s'])
                returns += new_traj.cat(['r'])
                actions += new_traj.cat(['a'])  # append actions as well
                infos += new_traj.i
            mean_return = torch.cat(returns).cpu().mean() * self.config.horizon
            if mode == 'explore':
                self.policy_mixture_returns.append(mean_return.item())
                self.log(f'[policy mixture returns: {np.around(self.policy_mixture_returns, 3)}]')
            states = [s.cpu() for s in states]
            print(f'return ({mode}): {mean_return}')
            self.replay_buffer[mode].append(states)
            actions = [a.cpu() for a in actions]
            self.replay_buffer_actions[mode].append(actions)
            self.replay_buffer_infos[mode].append(sum(infos, []))

    # optimize the explore and/or exploit policies
    def optimize_policy(self):
        for mode in ['explore', 'exploit']:
            if mode == 'exploit' and self.epoch < self.config.start_exploit:
                continue
            for i in range(self.config.n_policy_loops):
                rewards = self.step_optimize_policy(mode=mode)
                if not i % 5:
                    print(f'[optimizing policy ({mode}), step {i}, mean return: {rewards.mean():.5f}]')
        # add the newly optimized explore policy to the cover
        self.policy_mixture.append(copy.deepcopy(self.network['explore'].state_dict()))
        self.policy_mixture_optimizers.append(copy.deepcopy(self.optimizer['explore'].state_dict()))
        print(f'{len(self.policy_mixture)} policies in mixture')

    def initialize_new_policy(self, mode):
        self.network[mode] = self.config.network_fn()
        self.optimizer[mode] = self.config.optimizer_fn(self.network[mode].parameters())

    # gather a batch of data and perform some policy optimization steps
    def step_optimize_policy(self, mode=None):
        config = self.config
        network = self.network[mode]
        optimizer = self.optimizer[mode]
        states, actions, rewards, log_probs_old, returns, advantages = [], [], [], [], [], []
        self.time('reset')
        # gather the trajectories
        for i in range(self.config.n_traj_per_loop):
            # some fraction of the time we roll in with the policy itself
            # (so no data is wasted), otherwise from the mixture
            coin = np.random.rand()
            if coin <= (1.0 - self.config.proll):
                traj = self.gather_trajectories(add_bonus_reward=(mode == 'explore'),
                                                mode=mode, roll_in=False)
            else:
                traj = self.gather_trajectories(add_bonus_reward=(mode == 'explore'),
                                                mode=mode, roll_in=True)
            states += traj.cat(['s'])
            actions += traj.cat(['a'])
            log_probs_old += traj.cat(['log_pi_a'])
            returns += traj.cat(['ret'])
            rewards += traj.cat(['r'])
            advantages += traj.cat(['adv'])

        states = torch.cat(states, 0)
        actions = torch.cat(actions, 0)
        log_probs_old = torch.cat(log_probs_old, 0)
        returns = torch.cat(returns, 0)
        rewards = torch.cat(rewards, 0)
        advantages = torch.cat(advantages, 0)
        assert states.shape[0] == actions.shape[0] == rewards.shape[0] == advantages.shape[0] == returns.shape[0]
        actions = actions.detach()
        log_probs_old = log_probs_old.detach()
        advantages = (advantages - advantages.mean()) / advantages.std()
        self.time('reset')

        # optimize the policy on the gathered trajectories with the PPO objective
        for _ in range(config.optimization_epochs):
            sampler = random_sample(np.arange(states.size(0)), config.mini_batch_size)
            for batch_indices in sampler:
                batch_indices = tensor(batch_indices).long()
                sampled_states = states[batch_indices]
                sampled_actions = actions[batch_indices]
                sampled_log_probs_old = log_probs_old[batch_indices]
                sampled_returns = returns[batch_indices]
                sampled_advantages = advantages[batch_indices]
                prediction = network(sampled_states, sampled_actions)
                ratio = (prediction['log_pi_a'] - sampled_log_probs_old).exp()
                obj = ratio * sampled_advantages
                obj_clipped = ratio.clamp(1.0 - self.config.ppo_ratio_clip,
                                          1.0 + self.config.ppo_ratio_clip) * sampled_advantages
                policy_loss = -torch.min(obj, obj_clipped).mean() - config.entropy_weight * prediction['ent'].mean()
                value_loss = 0.5 * (sampled_returns - prediction['v']).pow(2).mean()
                optimizer.zero_grad()
                (policy_loss + value_loss).backward()
                nn.utils.clip_grad_norm_(network.parameters(), config.gradient_clip)
                optimizer.step()
        return rewards.mean()

    # we clip the actions since the policy samples actions from a Gaussian in the
    # continuous case; this keeps the policy from generating large actions to
    # maximize the negative log-det
    def clip_actions(self, actions):
        # actions: numpy array; only clip in the continuous setting
        if isinstance(self.task.action_space, Box):
            for i in range(self.config.action_dim):
                actions[:, i] = np.clip(actions[:, i],
                                        self.task.action_space.low[i],
                                        self.task.action_space.high[i])
        return actions

    def eval_step(self, state):
        network = self.network['exploit']
        prediction = network(state)
        action = to_np(prediction['a'])
        return action

    # test function for the exploit policy
    def test_exploit_policy_performance(self):
        network = self.network['exploit']
        roll_in_length = self.config.horizon
        storage = Storage(roll_in_length)
        num_trajs = 0
        total_rews = 0
        states = self.task.reset()  # reset the environment, so we roll in from the beginning
        for i in range(roll_in_length):
            prediction = network(states)
            next_states, rewards, terminals, info = self.task.step(to_np(prediction['a']))
            states = next_states  # advance the rollout
            num_trajs += terminals.sum()
            total_rews += rewards.sum()
        assert num_trajs > 0
        # this may overestimate rewards, but it is fair across all baselines
        return total_rews / num_trajs
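# The kernel bonuses above are elliptical-potential bonuses: with covariance
# Sigma = Phi^T Phi + ridge * I over visited features, the bonus for a query
# feature phi is sqrt(phi^T Sigma^{-1} phi). A standalone NumPy sketch
# (hypothetical names, not part of the agent):
import numpy as np

def elliptical_bonus(phi, Phi_history, ridge=1.0):
    # phi: (d,) query feature; Phi_history: (n, d) features of visited points
    d = Phi_history.shape[1]
    sigma = Phi_history.T @ Phi_history + ridge * np.eye(d)
    return np.sqrt(phi @ np.linalg.solve(sigma, phi))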
# alternative feature/target definitions kept from experimentation:
# X = mat[colstolearn]
# X = mat[colstolearn2]
X = mat.drop(['y'], axis=1)
# Y = mat['y'] / med
# Y = mat['y']
# note: this filter should also be applied to X, or the row counts will disagree
Y = mat[((mat['y'] > 10000) | (mat['y'] < 0)) == False]['y'] / med
Y = np.log1p(Y)

scaler = StandardScaler()
scaler.fit(X)
joblib.dump(scaler, 'sklean_scaler1.pkl', compress=True)
X = scaler.transform(X)

rbf = RBFSampler(gamma=0.05, n_components=100)
rbf.fit(X)
X = rbf.transform(X)

X, Y, med = shuffle(X, Y, med)
offset = int(X.shape[0] * 0.2)
X_train, y_train = X[:offset], Y[:offset]
X_test, y_test, med_test = X[offset:], Y[offset:], med[offset:]

n_est = 80
params = {
    'loss': 'lad',
    'n_estimators': n_est,
    'max_depth': 8,
class LSTDQ_Kernel():
    def __init__(self, dataset, obs_dim, act_dim, gamma, horizon, value_reg,
                 default_length_scale=0.2, random_feature_per_obs_dim=250,
                 norm=None, scale_length_adjustment='median', dtype=np.float32,
                 policy_net=None, separate_action_indexing=False,
                 action_encoding_scheme='continuous'):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.gamma = gamma
        self.horizon = horizon
        self.norm = norm
        self.policy_net = policy_net
        self.value_reg = value_reg
        self.dtype = dtype
        self.separate_action_indexing = separate_action_indexing
        self.action_encoding_scheme = action_encoding_scheme

        self.n_samples = dataset['obs'].shape[0]
        self.n_episode = dataset['init_obs'].shape[0]
        self.non_terminal_idx = (dataset['info'] == False)[:, 0]
        self.n_samples_non_terminal = self.non_terminal_idx.sum()
        self.data_acts = dataset['acts'][self.non_terminal_idx]

        if self.policy_net is not None:
            self.pi_current = self.policy_net.get_probabilities(dataset['obs'])
            self.pi_next = self.policy_net.get_probabilities(dataset['next_obs'])
            self.pi_init = self.policy_net.get_probabilities(dataset['init_obs'])
            self.pi_term = self.policy_net.get_probabilities(dataset['term_obs'])
        else:
            self.pi_current = dataset['target_prob_obs'][self.non_terminal_idx]
            self.pi_next = dataset['target_prob_next_obs'][self.non_terminal_idx]
            self.pi_init = dataset['target_prob_init_obs']
            self.pi_term = dataset['target_prob_term_obs']

        if self.norm is None:
            self.obs = dataset['obs'][self.non_terminal_idx]
            self.next_obs = dataset['next_obs'][self.non_terminal_idx]
            self.init_obs = dataset['init_obs']
            self.term_obs = dataset['term_obs']
        elif self.norm == 'std':
            self.obs_mean = np.mean(dataset['obs'], axis=0, keepdims=True)
            self.obs_std = np.std(dataset['obs'], axis=0, keepdims=True)
            self.obs = (dataset['obs'] - self.obs_mean) / self.obs_std
            self.next_obs = (dataset['next_obs'] - self.obs_mean) / self.obs_std
            self.init_obs = (dataset['init_obs'] - self.obs_mean) / self.obs_std
            self.term_obs = (dataset['term_obs'] - self.obs_mean) / self.obs_std
        else:
            raise NotImplementedError

        # re-whiten the observations using statistics of the non-terminal tuples only
        non_terminal_idx = (dataset['info'] == False)[:, 0]
        obs_mean = np.mean(dataset['obs'][non_terminal_idx], axis=0, keepdims=True)
        obs_std = np.std(dataset['obs'][non_terminal_idx], axis=0, keepdims=True)
        self.obs = (self.obs - obs_mean) / obs_std
        self.next_obs = (self.next_obs - obs_mean) / obs_std
        self.init_obs = (self.init_obs - obs_mean) / obs_std
        self.term_obs = (self.term_obs - obs_mean) / obs_std

        # if not separate_action_indexing, we concatenate (s, a) as input
        if not self.separate_action_indexing:
            if self.action_encoding_scheme == 'continuous':
                encoded_actions = np.linspace(-1, 1, self.act_dim)
                mean_action = np.mean(encoded_actions[self.data_acts])
                std_action = np.std(encoded_actions[self.data_acts])
                self.encoded_actions = (encoded_actions - mean_action) / std_action
                self.act = self.encoded_actions[self.data_acts]
                self.input = np.concatenate((self.obs, self.act), axis=1)
                self.input_dim = self.input.shape[1]
            else:
                raise NotImplementedError
        else:
            self.input = self.obs
            self.input_dim = self.obs.shape[1]

        if scale_length_adjustment == 'median':
            sample_num = 5000
            # median heuristic: estimate per-dimension squared distances between
            # randomly paired non-terminal samples
            idx1 = np.random.choice(self.n_samples_non_terminal, sample_num)
            idx2 = np.random.choice(self.n_samples_non_terminal, sample_num)
            med_dist = np.median(np.square(self.input[None, idx1, :] - self.input[idx2, None, :]),
                                 axis=(0, 1))
            # enforce an upper bound on the scale length of the action component
            med_dist[med_dist < 0.01] = 0.01
            self.scale_length_vector = 1.0 / med_dist
        else:
            self.scale_length_vector = np.ones(self.input_dim)

        # set up the random Fourier features (a per-dimension FeatureUnion of
        # RBFSamplers was tried here and left disabled)
        self.z_dim = random_feature_per_obs_dim * self.input_dim
        self.rff = RBFSampler(n_components=self.z_dim, gamma=default_length_scale)
        self.rff.fit([self.input[0]])

        # some commonly used variables
        self.rews = dataset['rews'][self.non_terminal_idx]
        # make sure that the importance weights are already calculated
        self.rho = dataset['ratio'][self.non_terminal_idx]

    def estimate(self):
        if self.separate_action_indexing:
            value_est = self.estimate_LSTDQ_separate_action_indexing()
        else:
            value_est = self.estimate_LSTDQ_concat_sa_input()
        return value_est

    def estimate_LSTDQ_concat_sa_input(self):
        # build (s', a') pairs for every action under the target policy
        a_prime = np.tile(self.encoded_actions, self.n_samples_non_terminal)[:, np.newaxis]
        x_prime = np.concatenate(
            (np.repeat(self.next_obs, self.act_dim, axis=0), a_prime), axis=1)
        a0_expanded = np.tile(self.encoded_actions, self.n_episode)[:, np.newaxis]
        x0 = np.concatenate(
            (np.repeat(self.init_obs, self.act_dim, axis=0), a0_expanded), axis=1)
        aterm_expanded = np.tile(self.encoded_actions, self.n_episode)[:, np.newaxis]
        xterm = np.concatenate(
            (np.repeat(self.term_obs, self.act_dim, axis=0), aterm_expanded), axis=1)

        Z = self.rff.transform(self.input).astype(self.dtype)
        Z_prime = self.rff.transform(x_prime).astype(self.dtype)
        aprime_probs = self.pi_next.flatten()[:, np.newaxis]
        Z_prime = Z_prime * aprime_probs
        Z_prime = Z_prime.reshape(
            (self.n_samples_non_terminal, self.act_dim, self.z_dim)).sum(axis=1)

        reg = self.value_reg
        regularized_inverse = np.linalg.inv(
            np.matmul(Z.T, Z - self.gamma * Z_prime) + reg * np.eye(self.z_dim))
        featurized_reward = np.matmul(Z.T, self.rews)
        value_coef = np.matmul(regularized_inverse, featurized_reward)

        Z0 = self.rff.transform(x0)
        Q0 = np.matmul(Z0, value_coef)
        Z_term = self.rff.transform(xterm)
        Q_term = np.matmul(Z_term, value_coef)
        V_init = (Q0 * self.pi_init.flatten()[:, np.newaxis]).reshape(
            (self.n_episode, self.act_dim)).sum(axis=1)
        V_term = (Q_term * self.pi_term.flatten()[:, np.newaxis]).reshape(
            (self.n_episode, self.act_dim)).sum(axis=1)
        V_traj = V_init - V_term * self.gamma**self.horizon
        value_est = np.mean(V_traj)
        return value_est

    def estimate_LSTDQ_separate_action_indexing(self):
        # separate action-set indexing
        act_idx = []
        for i in range(self.act_dim):
            act_idx.append(np.where(self.data_acts == i)[0])

        # apply the random-feature transformation
        Z = self.rff.transform(self.obs).astype(self.dtype)
        Z_prime = self.rff.transform(self.next_obs).astype(self.dtype)
        Z_init = self.rff.transform(self.init_obs).astype(self.dtype)
        Z_term = self.rff.transform(self.term_obs).astype(self.dtype)
        assert self.z_dim == Z.shape[1]

        Phi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim), dtype=self.dtype)
        Phi_pi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim), dtype=self.dtype)
        Phi_prime_pi = np.zeros((Z_prime.shape[0], Z_prime.shape[1] * self.act_dim), dtype=self.dtype)
        Phi_init_pi = np.zeros((Z_init.shape[0], Z_init.shape[1] * self.act_dim), dtype=self.dtype)
        Phi_term_pi = np.zeros((Z_term.shape[0], Z_term.shape[1] * self.act_dim), dtype=self.dtype)
        for i in range(self.act_dim):
            Phi[act_idx[i], i * self.z_dim:(i + 1) * self.z_dim] = Z[act_idx[i]]
            Phi_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_current[:, i][:, None] * Z
            Phi_prime_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_next[:, i][:, None] * Z_prime
            Phi_init_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_init[:, i][:, None] * Z_init
            Phi_term_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_term[:, i][:, None] * Z_term
        I_sa = np.eye(self.act_dim * self.z_dim, dtype=self.dtype)
        regularized_inverse = np.linalg.inv(
            np.matmul(Phi.T, Phi - self.gamma * Phi_prime_pi) + self.value_reg * I_sa)
        featurized_reward = np.matmul(Phi.T, self.rews)
        reward_coef = np.matmul(regularized_inverse, featurized_reward)
        V_init = Phi_init_pi @ reward_coef
        V_term = Phi_term_pi @ reward_coef
        V_traj = V_init - V_term * self.gamma**self.horizon
        value_est = np.mean(V_traj)
        return value_est
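# Both estimators above solve the same regularized LSTD-Q fixed point,
# w = (Phi^T (Phi - gamma * Phi'_pi) + reg * I)^{-1} Phi^T r. A condensed sketch
# of the core linear solve (hypothetical array names):
import numpy as np

def lstdq_weights(Phi, Phi_next_pi, rewards, gamma, reg):
    # solve (Phi^T (Phi - gamma Phi'_pi) + reg I) w = Phi^T r
    d = Phi.shape[1]
    A = Phi.T @ (Phi - gamma * Phi_next_pi) + reg * np.eye(d)
    return np.linalg.solve(A, Phi.T @ rewards)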
import sys
import os
import numpy as np
import itertools
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import RBFSampler
from sklearn.kernel_approximation import AdditiveChi2Sampler

pycharm_mode = True
N_FEATURES = 400  # dimension of the original data
BATCH_SIZE = 30000

# AdditiveChi2Sampler with the default sample_steps=2 maps
# N_FEATURES -> N_FEATURES * (2 * 2 - 1) = 1200 features.
chi = AdditiveChi2Sampler()
chi.fit(np.zeros((1, N_FEATURES)))
rbf = RBFSampler(gamma=1, random_state=1337, n_components=5500)
rbf.fit(np.zeros((1, 1200)))


def transform(x_original):
    # chain the chi2 map and the RBF random features for a single sample
    x_chi = chi.transform(x_original.reshape(1, -1))
    return rbf.transform(x_chi).ravel()


def lines(source):
    for line in source:
        line = line.strip()
        (label, x_string) = line.split(" ", 1)
        label = int(label)
        x_original = np.array(x_string.split(), dtype=np.float64)
        yield label, transform(x_original)


def main():
    if pycharm_mode:
        import argparse
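# A hedged example of pushing one sample through the chained approximation above
# (hypothetical random input; entries must be non-negative for the chi2 map):
x = np.random.rand(N_FEATURES)  # hypothetical raw feature vector
z = transform(x)                # chi2 map to 1200 dims, then 5500 RFF dims
print(z.shape)                  # (5500,)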
class SarsaLambdaAgent:
    def __init__(self, environment=gym.make('MountainCar-v0')):
        self.env = environment
        self.state = self.env.reset()
        self.state_low_bound = self.env.observation_space.low
        self.state_high_bound = self.env.observation_space.high
        self.n_action = self.env.action_space.n
        self.action_space = gym.spaces.Discrete(self.n_action)
        self.d = 100
        self.w = np.random.rand(self.d)
        # fit the random Fourier features on sampled (state, action) triples
        self.feature = RBFSampler(gamma=1, random_state=1)
        X = []
        for _ in range(100000):
            s = self.env.observation_space.sample()
            sa = np.append(s, np.random.randint(self.n_action))
            X.append(sa)
        self.feature.fit(X)

    def feature_x(self, s, a):
        return self.feature.transform([[s[0], s[1], a]])

    def is_state_valid(self, s):
        valid = True
        for i in range(s.shape[0]):
            if (s[i] < self.state_low_bound[i]) and (s[i] > self.state_high_bound[i]):
                valid = False
        return valid

    def Q_hat(self, s, a):
        if self.is_state_valid(s):
            return np.dot(self.feature_x(s, a), np.transpose(self.w))

    def reset(self):
        self.state = self.env.reset()

    def A_max(self, state, epsilon):
        if np.random.rand() < epsilon:
            # exploration
            return np.random.randint(self.n_action)
        else:
            # exploitation: collect all actions tied for the maximal Q-value
            max_a = []
            maxQ = -np.inf
            for a in range(0, self.n_action):
                if self.Q_hat(state, a) > maxQ:
                    max_a = [a]
                    maxQ = self.Q_hat(state, a)
                elif self.Q_hat(state, a) == maxQ:
                    max_a.append(a)
            if max_a != []:
                return max_a[np.random.randint(0, len(max_a))]
            else:
                return np.random.randint(self.n_action)

    def train(self, n_episode=5000, learning_rate=0.01, gamma=0.99,
              epsilon=0.01, lamda=0.9):
        num_steps_of_episode = []
        for i_episode in range(n_episode):
            self.reset()
            n_trajectory = 0
            a = self.A_max(state=self.state, epsilon=epsilon)
            z = np.zeros(self.d)
            Q_old = 0
            while True:
                s = np.copy(self.state)
                while True:
                    try:
                        s_, r_, done, _ = self.env.step(a)
                        a_ = self.A_max(state=s_, epsilon=epsilon)
                        break
                    except (RuntimeError, TypeError, NameError):
                        print("Action {} at state {} is invalid!".format(a, self.state))
                Q = self.Q_hat(s, a)
                Q_ = self.Q_hat(s_, a_)  # true-online Sarsa bootstraps on the next action a_
                delta = r_ + gamma * Q_ - Q
                z = (gamma * lamda * z
                     + (1 - learning_rate * gamma * lamda
                        * np.dot(self.feature_x(s, a), np.transpose(z))) * self.feature_x(s, a))
                self.w = (self.w + learning_rate * (delta + Q - Q_old) * z
                          - learning_rate * (Q - Q_old) * self.feature_x(s, a))
                Q_old = Q_
                self.state = s_
                a = a_
                n_trajectory += 1
                if done:
                    num_steps_of_episode.append(n_trajectory)
                    if n_trajectory % DISPLAY_STEP == 0:
                        print("Episode = {}, took {} to go to the goal.".format(
                            i_episode, n_trajectory))
                    break
        return num_steps_of_episode

    def get_w(self):
        return self.w
class CCNNLayer:
    def __init__(self, name: str, input_size: int, filter_size: int,
                 gamma: float, m: int, R: float, r: int, lr: float):
        self.name = name
        self.input_size = input_size
        self.filter_size = filter_size
        self.patch_size = filter_size ** 2
        self.output_size = self.input_size - self.filter_size + 1
        self.n_patchs = self.output_size ** 2
        self.m = m
        self.R = R
        self.lr = lr
        self.rbf_feature = RBFSampler(gamma=gamma, n_components=m, random_state=1)
        self.svd = TruncatedSVD(n_components=r)

    def initPars(self, n_classes: int, batch_size: int):
        self.n_classes = n_classes
        self.batch_size = batch_size
        self.lr /= batch_size
        self.A = np.random.normal(0, 0.1, size=(n_classes, self.n_patchs, self.m))

    def getZMatrix(self, X):
        """
        Input: (n_instances, n_channels, input_size, input_size)
        Output: (n_instances, n_patchs, m)
        """
        Z = view_as_windows(X, (1, X.shape[1], self.filter_size, self.filter_size))
        Z = Z.reshape(np.prod(Z.shape[:4]), np.prod(Z.shape[4:]))
        Q = self.rbf_feature.transform(Z).astype(np.float16)
        return Q.reshape(X.shape[0], self.n_patchs, -1)

    def predict(self, X, transform: bool = False):
        """
        Input: (batch_size, n_channels, input_size, input_size)
        Transformed input: (batch_size, n_patchs, m)
        Output: (batch_size, n_classes)
        """
        Z = self.getZMatrix(X) if transform else X
        p = np.exp(np.tensordot(Z, self.A, axes=[(1, 2), (1, 2)]))
        return (p.T / np.sum(p, axis=1)).T

    def fit(self, X, ylabel, n_epoch: int):
        assert X.shape[2] == X.shape[3] == self.input_size
        n = X.shape[0]
        self.rbf_feature.fit(np.zeros((1, X.shape[1] * self.filter_size ** 2)))
        print("Preparing patches...")
        Z_batches = [self.getZMatrix(X[i: i + self.batch_size])
                     for i in range(0, n, self.batch_size)]
        y_batches = ylabel.reshape(-1, self.batch_size)
        print("Starting PSGD...")
        loss = np.inf
        rhat = self.m
        for epoch in range(n_epoch):
            print("{0}: Epoch {1}: loss = {2}, r_hat = {3}".format(
                self.name, epoch + 1, loss / n, rhat))
            loss = 0
            for i, (Z_batch, y_batch) in enumerate(zip(Z_batches, y_batches)):
                p_batch = self.predict(Z_batch)
                loss += np.sum(-np.log(p_batch[np.arange(self.batch_size), y_batch]))
                dL_batch = -p_batch
                dL_batch[np.arange(self.batch_size), y_batch] += 1
                self.A += self.lr * np.tensordot(dL_batch, Z_batch, axes=[0, 0])
                # project A back toward the nuclear-norm ball of radius R
                A_unfold = self.A.reshape(-1, self.A.shape[2]).T
                U = self.svd.fit_transform(A_unfold)
                self.U = U.copy()
                d = np.linalg.norm(U, axis=0)
                U *= 1 / d
                d_cum = np.cumsum(d)
                rhat = np.searchsorted(
                    d_cum - self.R > np.append(d[1:] * np.arange(1, d.size), 0), True) + 1
                if rhat >= d.size:
                    print("Warning: Hard-thresholding applied")
                if rhat <= d.size:
                    scale = np.maximum(0, d - (d_cum[rhat - 1] - self.R) / rhat)
                    U = U[:, :rhat]
                    d = d[:rhat]
                    self.U = U * scale[:rhat]
                    self.A = ((self.U * (1 / d)) @ (U.T @ A_unfold)).T.reshape(*self.A.shape)
        Z_batches = None
        y_batches = None

    def transform(self, X):
        """
        Input: (batch_size, n_channels, input_size, input_size)
        Output: (batch_size, n_output_channels, output_size, output_size)
        """
        Z = np.rollaxis(np.tensordot(self.U, self.getZMatrix(X), axes=[0, 2]), 0, 2)
        return Z.reshape(Z.shape[0], Z.shape[1], self.output_size, self.output_size)
state_samples = np.array(
    [env.observation_space.sample() for _ in range(10000)])

# Num    Observation    Min      Max
# 0      position       -1.2     0.6
# 1      velocity       -0.07    0.07
position_max = np.amax(state_samples[:, 0])
position_min = np.amin(state_samples[:, 0])
velocity_max = np.amax(state_samples[:, 1])
velocity_min = np.amin(state_samples[:, 1])

scaler = StandardScaler()
scaler.fit(state_samples)
scaled_samples = scaler.transform(state_samples)

featurizer_state = RBFSampler(gamma=0.5, n_components=100)
featurizer_state.fit(scaled_samples)
print(featurizer_state)

state = env.reset()
print(state_samples[20])
featurized = featurizer_state.transform([state_samples[10]])

# In[75]:

class ValueFunction(object):
    """
    Value Function approximator.
    """

    def __init__(self):
        # Sampling environment states in order to featurize them.
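# --- Helper sketch (added; not in the original notebook). Belongs with the
# scaler/featurizer setup above: wraps both into one function mapping a raw
# observation to its 100-dimensional feature vector.
def featurize_state(state):
    scaled = scaler.transform([state])
    return featurizer_state.transform(scaled)[0]

print(featurize_state(env.reset()).shape)  # (100,)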
XtrainT = kpls.transform(ktrain)
XtestT = kpls.transform(ktest)
if n == 573:
    kplsScoresNys[:, 0] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)
elif n == 1073:
    kplsScoresNys[:, 1] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)
elif n == 1573:
    kplsScoresNys[:, 2] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)

# RBF sampler method
elapTimeRBFS = np.zeros(np.shape(nComponents))
kplsScoresRBFS = np.zeros((2, 3))
for i, n in enumerate(nComponents):
    rbfs = RBFSampler(n_components=n, gamma=gamma)
    rbfs.fit(Xtrain)
    ktrain = rbfs.transform(Xtrain)
    ktest = rbfs.transform(Xtest)
    startTime = timeit.default_timer()
    kpls.fit(ktrain, Ytrain)
    elapTimeRBFS[i] = timeit.default_timer() - startTime
    XtrainT = kpls.transform(ktrain)
    XtestT = kpls.transform(ktest)
    if n == 573:
        kplsScoresRBFS[:, 0] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)
    elif n == 1073:
        kplsScoresRBFS[:, 1] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)
    elif n == 1573:
        kplsScoresRBFS[:, 2] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)
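# --- Reporting sketch (added; assumes the arrays above were filled for
# nComponents = [573, 1073, 1573]). Prints the tradeoff this experiment is
# after: KPLS fit time versus the scores returned by util.classify for the
# RBFSampler approximation.
print("KPLS fit times (s) per n_components:", elapTimeRBFS)
print("Scores (columns: n_components = 573 / 1073 / 1573):")
print(kplsScoresRBFS)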
    ngram_range=(2, 4))

# Fit the rbf_sampler with the similarity matrix.
column_transformer = make_column_transformer(
    (similarity_encoder, ['NONPROPRIETARYNAME']),
    (OneHotEncoder(handle_unknown='ignore'), ['DOSAGEFORMNAME', 'ROUTENAME']),
    sparse_threshold=1)

transformed_categories = column_transformer.fit_transform(X_encoder)

# gamma is a parameter of the RBF function that sets how fast the similarity
# between two points decreases as the distance between them grows. It is
# data-specific and needs to be chosen carefully, for example using
# cross-validation.
rbf_sampler = RBFSampler(gamma=0.5, n_components=n_out_rbf, random_state=42)
rbf_sampler.fit(transformed_categories)


def encode(X, y_int, one_hot_encoder, column_transformer, rbf_sampler):
    X_sim_encoded = column_transformer.transform(X)
    X_highdim = rbf_sampler.transform(X_sim_encoded.toarray())
    y_onehot = one_hot_encoder.transform(y_int.reshape(-1, 1))
    return X_highdim, y_onehot


# The inputs and labels of the val and test sets have to be pre-processed the
# same way the training set was processed:
X_test_kernel_approx, y_true_test_onehot = encode(X_test, y_test,
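# --- Gamma selection sketch (added after the truncated example above;
# illustrative only, not part of the original). One way to choose gamma
# "carefully, for example using cross-validation", as noted above: grid-search
# an RBFSampler inside a pipeline. X_dense and y are hypothetical placeholders
# for a dense 2-D feature array and its labels.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler

pipe = Pipeline([
    ('rbf', RBFSampler(n_components=100, random_state=42)),
    ('clf', SGDClassifier(random_state=42)),
])
param_grid = {'rbf__gamma': [0.01, 0.1, 0.5, 1.0]}
grid = GridSearchCV(pipe, param_grid, cv=3)
# grid.fit(X_dense, y); print(grid.best_params_)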
class DecomposableKernel(object):
    r"""
    Decomposable Operator-Valued Kernel of the form:

    .. math::
        X, Y \mapsto K(X, Y) = k_s(X, Y) A

    where A is a symmetric positive semidefinite operator acting on the
    outputs.

    Attributes
    ----------
    A : {array, LinearOperator}, shape = [n_targets, n_targets]
        Linear operator acting on the outputs

    scalar_kernel : {callable}
        Callable which associates to the training points X the Gram matrix.

    scalar_kernel_params : {mapping of string to any}
        Additional parameters (keyword arguments) for the kernel function
        passed as a callable object.

    References
    ----------

    See also
    --------
    DecomposableKernelMap
        Decomposable Kernel map

    Examples
    --------
    >>> import operalib as ovk
    >>> import numpy as np
    >>> X = np.random.randn(100, 10)
    >>> K = ovk.DecomposableKernel(np.eye(2))
    >>> # The kernel matrix as a linear operator
    >>> K(X, X)  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    <200x200 _CustomLinearOperator with dtype=float64>
    """

    def __init__(self, A, scalar_kernel=rbf_kernel, scalar_kernel_params=None):
        """Initialize the Decomposable Operator-Valued Kernel.

        Parameters
        ----------
        A : {array, LinearOperator}, shape = [n_targets, n_targets]
            Linear operator acting on the outputs

        scalar_kernel : {callable}
            Callable which associates to the training points X the Gram
            matrix.

        scalar_kernel_params : {mapping of string to any}, optional
            Additional parameters (keyword arguments) for the kernel function
            passed as a callable object.
        """
        self.A = A
        self.scalar_kernel = scalar_kernel
        self.scalar_kernel_params = scalar_kernel_params
        self.p = A.shape[0]

    def get_kernel_map(self, X):
        r"""Return the kernel map associated with the data X.

        .. math::
            K_x: Y \mapsto K(X, Y)

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        K_x : DecomposableKernelMap, callable

        .. math::
            K_x: Y \mapsto K(X, Y).
        """
        from .kernel_maps import DecomposableKernelMap
        return DecomposableKernelMap(X, self.A,
                                     self.scalar_kernel,
                                     self.scalar_kernel_params)

    def get_orff_map(self, X, D=100, eps=1e-5, random_state=0):
        r"""Return the Random Fourier Feature map associated with the data X.

        .. math::
            K_x: Y \mapsto \tilde{\Phi}(X)

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        \tilde{\Phi}(X) : Linear Operator, callable
        """
        u, s, v = svd(self.A, full_matrices=False, compute_uv=True)
        self.B_ = dot(diag(sqrt(s[s > eps])), v[s > eps, :])
        self.r = self.B_.shape[0]
        if (self.scalar_kernel is rbf_kernel) and not hasattr(self, 'Xb_'):
            if self.scalar_kernel_params is None:
                gamma = 1.
            else:
                gamma = self.scalar_kernel_params['gamma']
            self.phi_ = RBFSampler(gamma=gamma, n_components=D,
                                   random_state=random_state)
            self.phi_.fit(X)
            self.Xb_ = self.phi_.transform(X).astype(X.dtype)
        elif (self.scalar_kernel == 'skewed_chi2') and not hasattr(self, 'Xb_'):
            if self.scalar_kernel_params is None:
                skew = 1.
            else:
                skew = self.scalar_kernel_params['skew']
            self.phi_ = SkewedChi2Sampler(skewedness=skew, n_components=D,
                                          random_state=random_state)
            self.phi_.fit(X)
            self.Xb_ = self.phi_.transform(X).astype(X.dtype)
        elif not hasattr(self, 'Xb_'):
            raise NotImplementedError('ORFF map for kernel is not '
                                      'implemented yet')

        D = self.phi_.n_components
        if X is self.Xb_:
            cshape = (D, self.r)
            rshape = (self.Xb_.shape[0], self.p)
            oshape = (self.Xb_.shape[0] * self.p, D * self.r)
            return LinearOperator(
                oshape, dtype=self.Xb_.dtype,
                matvec=lambda b: dot(dot(self.Xb_, b.reshape(cshape)),
                                     self.B_),
                rmatvec=lambda r: dot(self.Xb_.T, dot(r.reshape(rshape),
                                                      self.B_.T)))
        else:
            Xb = self.phi_.transform(X)
            cshape = (D, self.r)
            rshape = (X.shape[0], self.p)
            oshape = (Xb.shape[0] * self.p, D * self.r)
            return LinearOperator(
                oshape, dtype=self.Xb_.dtype,
                matvec=lambda b: dot(dot(Xb, b.reshape(cshape)), self.B_),
                rmatvec=lambda r: dot(Xb.T, dot(r.reshape(rshape),
                                                self.B_.T)))

    def __call__(self, X, Y=None):
        r"""Return the kernel map associated with the data X.

        .. math::
            K_x: \begin{cases}
            Y \mapsto K(X, Y) \enskip\text{if } Y \text{ is None,} \\
            K(X, Y) \enskip\text{otherwise.}
            \end{cases}

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples1, n_features]
            Samples.

        Y : {array-like, sparse matrix}, shape = [n_samples2, n_features],
            default = None
            Samples.

        Returns
        -------
        K_x : DecomposableKernelMap, callable or LinearOperator

        .. math::
            K_x: \begin{cases}
            Y \mapsto K(X, Y) \enskip\text{if } Y \text{ is None,} \\
            K(X, Y) \enskip\text{otherwise}
            \end{cases}
        """
        Kmap = self.get_kernel_map(X)
        if Y is None:
            return Kmap
        else:
            return Kmap(Y)
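# --- ORFF map usage sketch (added; assumes the surrounding operalib package
# is installed and importable as ovk, as in the docstring examples above).
# The returned LinearOperator maps a parameter vector of length D * r to
# stacked outputs of length n_samples * n_targets.
import numpy as np
import operalib as ovk

X = np.random.randn(100, 10)
K = ovk.DecomposableKernel(np.eye(2))
Phi = K.get_orff_map(X, D=100)
print(Phi.shape)  # (n_samples * n_targets, D * r) = (200, 200)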
class ApproximateTDAgent:
    def __init__(self, env, num_episodes=10000):
        """
        Constructor for a Temporal Difference Agent using function
        approximation. The function approximator is a linear regression on
        RBF features: y = np.dot(W, phi(x)).

        :param env: OpenAI Gym environment to interface with
        :param num_episodes: number of episodes to play to bootstrap phi(x) and W
        """
        # Interface with the environment
        self.env = env

        # Initialize featurizer function phi(x)
        self.featurizer = RBFSampler()
        samples = []
        for n in range(num_episodes):
            print("Running initial exploration episode: {}".format(n))
            s = self.env.reset()
            done = False  # reset the flag for every episode
            while not done:
                # Play the game randomly
                a = self.env.action_space.sample()
                x = self._vectorize(s, a)
                samples.append(x)
                s, _, done, _ = self.env.step(a)
        self.featurizer.fit(samples)
        self.W = np.zeros(self.featurizer.n_components)

    def _vectorize(self, s, a):
        """
        Helper function to vectorize state s and action a.

        :param s: state
        :type s: tuple
        :param a: action
        :type a: int
        """
        s = np.array(s)
        # One-hot encoding of actions
        a_vector = np.zeros(self.env.action_space.n)
        a_vector[a] = 1
        return np.concatenate((s, a_vector))

    def iterate_policy(self, alpha=0.1, gamma=0.9, epsilon=0.3, num_episodes=1000):
        """
        Implementation of Q-learning on the environment
        """
        deltas = []
        for n in range(num_episodes):
            print("Iterating episode {}".format(n))
            s = self.env.reset()
            done = False
            max_diff = float("-inf")
            while not done:
                a = self._select_action(s, epsilon)
                s_prime, r, done, _ = self.env.step(a)
                if done:
                    y = r
                else:
                    y = r + gamma * np.max(self.predict(s_prime))
                phi_x = self.featurizer.transform([self._vectorize(s, a)])[0]
                diff = y - np.dot(self.W, phi_x)
                self.W = self.W + alpha * diff * phi_x
                max_diff = max(max_diff, diff)
                s = s_prime
            deltas.append(max_diff)
        return deltas

    def _select_action(self, state, epsilon):
        """
        Helper function to trade off exploration against exploitation.
        This is effectively the pi(a|s) function.
        """
        p = np.random.random()
        if p <= epsilon:
            selected_action = self.env.action_space.sample()
        else:
            Q_values = self.predict(state)
            selected_action = np.argmax(Q_values)
        return selected_action

    def predict(self, state):
        """
        Predict the Q values for all actions of the input state.

        :param state: state for which Q is predicted
        """
        # Estimate Q as dot(W, phi(x)); this is a linear regression model.
        Q_values = []
        for action in range(self.env.action_space.n):
            x = self._vectorize(state, action)
            x = self.featurizer.transform([x])[0]
            Q_values.append(np.dot(self.W, x))
        return Q_values

    def play(self):
        """
        Play the agent according to the current policy.
        """
        done = False
        s = self.env.reset()
        total_rewards = 0
        while not done:
            # Always play according to policy
            a = self._select_action(s, epsilon=0.0)
            s, r, done, info = self.env.step(a)
            self.env.render()
            total_rewards += r
        return total_rewards
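# --- Usage sketch (added for illustration; assumes the classic gym API where
# env.step returns (obs, reward, done, info)). Any discrete-action
# environment with a low-dimensional observation should work.
import gym

agent = ApproximateTDAgent(gym.make('CartPole-v1'), num_episodes=100)
deltas = agent.iterate_policy(alpha=0.1, gamma=0.9, epsilon=0.3,
                              num_episodes=500)
print("Reward of one greedy episode:", agent.play())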
class RBFDivFreeKernel(object):
    r"""
    Divergence-free Operator-Valued Kernel of the form:

    .. math::
        X \mapsto K_X(Y) = \exp(-\gamma \|X - Y\|^2) A_{X, Y},

    where

    .. math::
        A_{X, Y} = 2 \gamma (X - Y)(X - Y)^T
                   + \left((d - 1) - 2 \gamma \|X - Y\|^2\right) I.

    Attributes
    ----------
    gamma : {float}
        RBF kernel parameter.

    References
    ----------

    See also
    --------
    RBFDivFreeKernelMap
        Divergence-free Kernel map

    Examples
    --------
    >>> import operalib as ovk
    >>> import numpy as np
    >>> X = np.random.randn(100, 2)
    >>> K = ovk.RBFDivFreeKernel(1.)
    >>> # The kernel matrix as a linear operator
    >>> K(X, X)  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    <200x200 _CustomLinearOperator with dtype=float64>
    """

    def __init__(self, gamma):
        """Initialize the Divergence-free Operator-Valued Kernel.

        Parameters
        ----------
        gamma : {float}
            RBF kernel parameter.
        """
        self.gamma = gamma

    def get_kernel_map(self, X):
        r"""Return the kernel map associated with the data X.

        .. math::
            K_x: Y \mapsto K(X, Y)

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        K_x : RBFDivFreeKernelMap, callable

        .. math::
            K_x: Y \mapsto K(X, Y).
        """
        from .kernel_maps import RBFDivFreeKernelMap
        return RBFDivFreeKernelMap(X, self.gamma)

    def get_orff_map(self, X, D=100, random_state=0):
        r"""Return the Random Fourier Feature map associated with the data X.

        .. math::
            K_x: Y \mapsto \tilde{\Phi}(X)

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        \tilde{\Phi}(X) : Linear Operator, callable
        """
        self.r = 1
        if not hasattr(self, 'Xb_'):
            self.phi_ = RBFSampler(gamma=self.gamma,
                                   n_components=D, random_state=random_state)
            self.phi_.fit(X)
            self.Xb_ = self.phi_.transform(X)
            self.Xb_ = (self.Xb_.reshape((self.Xb_.shape[0],
                                          1, self.Xb_.shape[1])) *
                        self.phi_.random_weights_.reshape((1, -1,
                                                           self.Xb_.shape[1])))
            self.Xb_ = self.Xb_.reshape((-1, self.Xb_.shape[2]))

        D = self.phi_.n_components
        if X is self.Xb_:
            return LinearOperator(self.Xb_.shape,
                                  matvec=lambda b: dot(self.Xb_, b),
                                  rmatvec=lambda r: dot(self.Xb_.T, r))
        else:
            Xb = self.phi_.transform(X)
            # TODO:
            # w = self.phi_.random_weights_.reshape((1, -1, Xb.shape[1]))
            # wn = np.linalg.norm(w)
            # Xb = (Xb.reshape((Xb.shape[0], 1, Xb.shape[1])) *
            #       wn * np.dot(w.T, w) / wn)
            Xb = Xb.reshape((-1, Xb.shape[1]))
            return LinearOperator(Xb.shape,
                                  matvec=lambda b: dot(Xb, b),
                                  rmatvec=lambda r: dot(Xb.T, r))

    def __call__(self, X, Y=None):
        r"""Return the kernel map associated with the data X.

        .. math::
            K_x: \begin{cases}
            Y \mapsto K(X, Y) \enskip\text{if } Y \text{ is None,} \\
            K(X, Y) \enskip\text{otherwise.}
            \end{cases}

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples1, n_features]
            Samples.

        Y : {array-like, sparse matrix}, shape = [n_samples2, n_features],
            default = None
            Samples.

        Returns
        -------
        K_x : RBFDivFreeKernelMap, callable or LinearOperator

        .. math::
            K_x: \begin{cases}
            Y \mapsto K(X, Y) \enskip\text{if } Y \text{ is None,} \\
            K(X, Y) \enskip\text{otherwise}
            \end{cases}
        """
        Kmap = self.get_kernel_map(X)
        if Y is None:
            return Kmap
        else:
            return Kmap(Y)
class ApproximateTDAgent(TemporalDifferenceAgent):
    """
    An agent that implements the function-approximation Q-learning algorithm.
    """

    def __init__(self, env, start_state=(0, 0), initial_policy=None, action_space=None):
        if initial_policy:
            self.policy = initial_policy
        else:
            # Define a random policy if no policy is given
            self.policy = {
                (0, 0): "down",
                (0, 1): "left",
                (0, 2): "right",
                (0, 3): "left",
                (1, 0): "down",
                (1, 2): "up",
                (2, 0): "right",
                (2, 1): "right",
                (2, 2): "right",
            }

        # Initialize action_space
        if action_space:
            self.action_space = action_space
        else:
            self.action_space = ["up", "down", "left", "right"]

        # Initialize state of agent
        self.start_state = start_state

        # Initialize featurizer function phi(x)
        self.featurizer = RBFSampler()

        # Placeholder for the weights of the linear regression model.
        # Use the explore() method to populate W with the dimensions of a
        # fitted featurizer; this builds a linear model of dot(W, phi(x)).
        self.W = None

        # Initialize V[s]
        self.V = {}
        self.num_rows = 0
        self.num_columns = 0
        for s in env.get_states():
            self.num_rows = max(self.num_rows, s[0] + 1)
            self.num_columns = max(self.num_columns, s[1] + 1)
            self.V[s] = 0

    def explore(self, env, num_episodes=10000):
        """
        Let the agent randomly explore the gridworld and collect samples for
        estimating Q(s, a). The larger num_episodes, the more data is
        collected and the better the estimates.
        """
        samples = []
        for n in range(num_episodes):
            print("Running initial exploration episode: {}".format(n))
            s = self.start_state
            while not env.is_terminal(s):
                # Play the game randomly
                a = np.random.choice(self.action_space)
                x = self._vectorize(s, a)
                samples.append(x)
                _, s_prime = env.move(s, a)
                s = s_prime
        # Fit the RBF featurizer and initialize weights
        self.featurizer.fit(samples)
        self.W = np.zeros(self.featurizer.n_components)

    def _vectorize(self, s, a):
        """
        Helper function to vectorize state s and action a
        """
        s = np.array(s)
        # One-hot encoding of actions
        a_idx = self.action_space.index(a)
        a = np.zeros(len(self.action_space))
        a[a_idx] = 1
        return np.concatenate((s, a))

    def iterate_policy(self, env, alpha=0.1, gamma=0.9, epsilon=0.3, num_episodes=1000):
        deltas = []
        for n in range(num_episodes):
            print("Running policy iteration episode: {}".format(n))
            max_diff = float("-inf")
            s = self.start_state
            while not env.is_terminal(s):
                a = self._select_action(s, epsilon)
                r, s_prime = env.move(s, a)
                if env.is_terminal(s_prime):
                    y = r
                else:
                    y = r + gamma * np.max(self.predict(s_prime))
                phi_x = self.featurizer.transform([self._vectorize(s, a)])[0]
                diff = y - np.dot(self.W, phi_x)
                self.W = self.W + alpha * diff * phi_x
                max_diff = max(max_diff, diff)
                s = s_prime
            deltas.append(max_diff)

        # Update optimal policy and value function
        for s in env.get_states():
            if not env.is_terminal(s):
                Q_values = self.predict(s)
                self.policy[s] = self.action_space[np.argmax(Q_values)]
                self.V[s] = np.max(Q_values)
        return deltas

    def _select_action(self, state, epsilon):
        """
        Helper function to trade off exploration against exploitation.
        This is effectively the pi(a|s) function.
        """
        p = np.random.random()
        if p <= epsilon:
            selected_action = np.random.choice(self.action_space)
        else:
            Q_values = self.predict(state)
            selected_action = self.action_space[np.argmax(Q_values)]
        return selected_action

    def predict(self, state):
        """
        Predict the Q values for all actions of the input state.

        :param state: state for which Q is predicted
        """
        # Estimate Q as dot(W, phi(x)); this is a linear regression model.
        Q_values = []
        for action in self.action_space:
            x = self._vectorize(state, action)
            x = self.featurizer.transform([x])[0]
            Q_values.append(np.dot(self.W, x))
        return Q_values
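# --- Usage sketch (added; hypothetical). Assumes a Gridworld environment
# exposing get_states(), is_terminal(s) and move(s, a) -> (reward, s_prime),
# as used by the agent above.
env = Gridworld()
agent = ApproximateTDAgent(env)
agent.explore(env, num_episodes=5000)   # collect samples, fit the featurizer
deltas = agent.iterate_policy(env, alpha=0.1, gamma=0.9, epsilon=0.3,
                              num_episodes=1000)
print(agent.policy)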
import sys

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.kernel_approximation import AdditiveChi2Sampler

DIMENSION = 400  # Dimension of the original data.
CLASSES = (-1, +1)  # The classes that we are trying to predict.

chi_feature = AdditiveChi2Sampler(sample_steps=1)
chi_feature.fit(np.zeros([1, 400]))
rbf = RBFSampler(n_components=15 * DIMENSION, random_state=1)
rbf.fit(np.zeros([1, 400]))


def transform(x_original):
    # Chi^2 feature map followed by RBF random features, with a bias term
    # prepended since the classifier is fit without an intercept.
    x_chi = chi_feature.transform([x_original])
    out = np.concatenate(([1], rbf.transform(x_chi)[0]))
    return out


if __name__ == "__main__":
    X = []
    Y = []
    # Initialize stochastic gradient descent
    cls = SGDClassifier(alpha=0.0001, fit_intercept=False, max_iter=15,
                        penalty="l2", warm_start=True)
    for line in sys.stdin:
        line = line.strip()
        (label, x_string) = line.split(" ", 1)
        label = int(label)
        x_original = np.fromstring(x_string, sep=' ')
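# --- Streaming sketch (added; the original loop above is truncated, and the
# lines below illustrate the usual pattern rather than reconstruct the
# missing code). SGDClassifier.partial_fit consumes one transformed example
# at a time, which is how a stdin stream like the one above is typically fed:
# x = transform(x_original)
# cls.partial_fit([x], [label], classes=list(CLASSES))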