def sample_chain(x0, N, energy_difference, proposal_stddev, thinning_factor=1, burn_in=0, temperature=1.0):
    """
    Run a plain random-walk Metropolis chain and collect N samples from it.

    Each step perturbs the current point with Gaussian noise drawn through
    np.random.normal(scale=proposal_stddev) and computes

        loga = -energy_difference(proposed_x, current_x) / temperature

    The move is accepted when loga >= 0 or when loga >= log(u) for a fresh
    u drawn from Uniform(0, 1). Nothing in here is specific to
    autoencoders; energy_difference is just a callable taking
    (proposed_x, current_x).

    x0              : starting point; a 1-D array is expected.
    N               : number of samples to collect.
    proposal_stddev : standard deviation of the proposal noise.
    thinning_factor : chain steps taken between two collected samples
                      (1 means that every step is collected).
    burn_in         : steps taken, and thrown away, before collecting.
    temperature     : divides the energy difference in the acceptance test.

    Returns (samples, acceptance_ratio). samples is the np.vstack of the
    collected points. acceptance_ratio only counts the steps taken after
    the burn-in.
    """

    if len(x0.shape) != 1:
        error("Wrong dimension for x0. This function is not vectorial.")

    if thinning_factor < 1:
        error(
            "You misunderstood the thinning_factor. It should be 1 for no thinning, and 32 if we want one out of every 32 samples."
        )

    # Bookkeeping shared with the stepping helper below.
    tally = {"accepted": 0, "rejected": 0}

    def advance(state, n_steps):
        for _ in np.arange(n_steps):
            candidate = state + np.random.normal(size=state.shape, scale=proposal_stddev)
            loga = -energy_difference(candidate, state) / temperature

            # The uniform variate is only drawn when loga < 0, so that the
            # random stream is consumed exactly when it is needed.
            take_it = loga >= 0
            if not take_it:
                take_it = loga >= np.log(np.random.uniform(0, 1))

            if take_it:
                state = candidate
                tally["accepted"] += 1
            else:
                tally["rejected"] += 1
        return state

    # Burn-in : move the chain away from x0 without recording anything.
    current_x = advance(x0, burn_in)

    # The acceptance statistics should only describe the collected part.
    tally["accepted"] = 0
    tally["rejected"] = 0

    progress_logger = make_progress_logger("Sampling")
    collected = []
    for n in np.arange(0, N):
        # one collected sample for every 'thinning_factor' chain steps
        current_x = advance(current_x, thinning_factor)
        collected.append(current_x)
        progress_logger(1.0 * n / N)

    samples = np.vstack(collected)
    n_steps_total = tally["accepted"] + tally["rejected"]
    acceptance_ratio = tally["accepted"] * 1.0 / n_steps_total

    return (samples, acceptance_ratio)
def sample_chain(x0, N, energy_difference, proposal_stddev, thinning_factor = 1, burn_in = 0, temperature = 1.0):
    """
    Vanilla Markov Chain Monte Carlo sampler with an isotropic Normal
    proposal of standard deviation proposal_stddev.

    The only model-specific ingredient is energy_difference, a callable
    invoked as energy_difference(proposed_x, current_x). Its negation,
    divided by temperature, is used as the log acceptance ratio.

    The chain first runs burn_in steps that are discarded, then records one
    point every thinning_factor steps until N points have been recorded.

    Returns the pair (samples, acceptance_ratio) where samples stacks the
    recorded points with np.vstack and acceptance_ratio is the fraction of
    accepted proposals among the steps taken after the burn-in.
    """

    if len(x0.shape) != 1:
        error("Wrong dimension for x0. This function is not vectorial.")

    if thinning_factor < 1:
        error("You misunderstood the thinning_factor. It should be 1 for no thinning, and 32 if we want one out of every 32 samples.")

    def metropolis_step(x):
        # Returns the next state along with a flag telling whether the
        # proposal was accepted.
        candidate = x + np.random.normal(size=x.shape, scale=proposal_stddev)
        loga = - energy_difference(candidate, x) / temperature
        if loga >= 0 or loga >= np.log(np.random.uniform(0,1)):
            return (candidate, True)
        return (x, False)

    def run_steps(x, how_many):
        # Returns the final state and the (accepted, rejected) counts
        # for this batch of steps.
        n_accepted = 0
        n_rejected = 0
        for _ in np.arange(how_many):
            (x, was_accepted) = metropolis_step(x)
            if was_accepted:
                n_accepted += 1
            else:
                n_rejected += 1
        return (x, n_accepted, n_rejected)

    # The burn-in counts are deliberately dropped : only the steps that
    # contribute to the recorded samples enter the acceptance ratio.
    (current_x, _, _) = run_steps(x0, burn_in)

    total_accepted = 0
    total_rejected = 0
    samples_list = []

    progress_logger = make_progress_logger("Sampling")
    for n in np.arange(0,N):
        (current_x, n_accepted, n_rejected) = run_steps(current_x, thinning_factor)
        total_accepted += n_accepted
        total_rejected += n_rejected
        # record the state reached after the thinning steps
        samples_list.append(current_x)
        progress_logger(1.0*n/N)

    samples = np.vstack(samples_list)
    acceptance_ratio = total_accepted * 1.0 / (total_accepted + total_rejected)

    return (samples, acceptance_ratio)
def sample_chain(x0, N, energy_difference, noise_levels, r, r_prime,
                 thinning_factor = 1, burn_in = 0,
                 accept_all_proposals = False,
                 proposal_noise_scheme = 'merge_x',
                 omit_asymmetric_proposal_factor = False):
    """
    Sample N values from a Metropolis-Hastings chain starting at x0, using
    a Langevin-style proposal built from the function r.

    x0 : starting point, must be a 1-D array.
    N : number of samples to collect.
    energy_difference : callable invoked as
        energy_difference(proposed_x, current_x). Its negation divided by
        the temperature is used as the log of the density ratio.
    noise_levels : dict from which the keys "train_stddev",
        "langevin_stddev", "langevin_beta" and "temperature" are all read
        unconditionally, so all four must be present.
    r : callable applied to the noisy point.
    r_prime : callable whose result is combined with np.eye(d) and handed to
        np.linalg.det (presumably the Jacobian of r, shape (d, d) -- to be
        confirmed against the callers).
    thinning_factor : chain steps between two collected samples (>= 1).
    burn_in : steps taken and discarded before collecting.
    accept_all_proposals : accept every proposal and skip the computation
        of the acceptance ratio.
    proposal_noise_scheme : one of 'merge_x', 'noise_E', 'noise_r'. With
        x_noisy = current_x + N(0, langevin_stddev) noise and
        beta = langevin_beta, the proposals are
            'merge_x' : (1 - beta) * x_noisy   + beta * r(x_noisy)
            'noise_E' : current_x - beta * x_noisy + beta * r(x_noisy)
            'noise_r' : (1 - beta) * current_x + beta * r(x_noisy)
    omit_asymmetric_proposal_factor : use 0.0 instead of
        log q(current_x | proposed_x) - log q(proposed_x | current_x).

    Returns (samples, acceptance_ratio, noise_levels). samples is the
    np.vstack of the collected points, acceptance_ratio only counts the
    steps taken after the burn-in, and noise_levels is handed back
    untouched.

    Raises ValueError when proposal_noise_scheme is not recognized.
    """

    # Debugging trace left in place from the original implementation.
    print(proposal_noise_scheme)

    assert len(x0.shape) == 1, "Wrong dimension for x0."
    assert thinning_factor >= 1, "You misunderstood the thinning_factor. It should be 1 for no thinning, and 32 if we want one out of every 32 samples."

    # train_stddev is not used by the sampler. The lookup is kept so that
    # a missing key still fails right here.
    train_stddev = noise_levels["train_stddev"]
    langevin_stddev = noise_levels["langevin_stddev"]
    langevin_beta = noise_levels["langevin_beta"]
    temperature = noise_levels["temperature"]

    # Reject an unknown scheme before running any step. The previous code
    # only noticed inside the first proposal, and did so by raising a bare
    # string, which is itself a TypeError.
    if proposal_noise_scheme not in ('merge_x', 'noise_E', 'noise_r'):
        raise ValueError("Unrecognized proposal_noise_scheme : %s" % proposal_noise_scheme)

    def langevin_proposal(current_x, preimage_current_x):
        # We are using the term "preimage" here because it corresponds
        # to the preimage when langevin_beta=1.0.
        # Otherwise, it should be called the "noisy_ancestor" or something
        # like that to reflect the fact that it's more about
        #
        #     x_{\textrm{noisy}}^{(t)} = x^{(t)} + \epsilon \hspace{1em} for \hspace{1em} \epsilon \sim \mathcal{N}(0,\sigma^{2})
        #     x^{*} = \left(1-\beta\right) x_{\textrm{noisy}}^{(t)} + \beta r^{*}(x_{\textrm{noisy}}^{(t)})
        #
        # than about being the preimage. Latex the stuff above to read it properly.

        d = current_x.shape[0]

        # All the schemes start from the same noisy version of current_x.
        preimage_proposed_x = current_x + np.random.normal(size=(d,), scale=langevin_stddev)

        if proposal_noise_scheme == 'merge_x':
            proposed_x = (1-langevin_beta) * preimage_proposed_x + langevin_beta * r(preimage_proposed_x)
        elif proposal_noise_scheme == 'noise_E':
            proposed_x = current_x - langevin_beta * preimage_proposed_x + langevin_beta * r(preimage_proposed_x)
        else:
            # 'noise_r', the scheme was validated above
            proposed_x = (1-langevin_beta) * current_x + langevin_beta * r(preimage_proposed_x)

        if accept_all_proposals or omit_asymmetric_proposal_factor:
            asymmetric_correction_log_factor = 0.0
        else:
            # Now we need to compute
            #     log q( current_x | proposed_x ) - log q( proposed_x | current_x )

            def change_of_variable_matrix(preimage):
                # NOTE(review): the original author marked the 'noise_E'
                # and 'noise_r' expressions as uncertain ("clueless").
                if proposal_noise_scheme == 'merge_x':
                    return (1-langevin_beta) * np.eye(d) + langevin_beta * r_prime(preimage)
                elif proposal_noise_scheme == 'noise_E':
                    return (-langevin_beta) * np.eye(d) + langevin_beta * r_prime(preimage)
                else:
                    return langevin_beta * r_prime(preimage)

            # Index 0 holds the Gaussian part, index 1 the log-determinant part.
            A = np.zeros((2,))
            B = np.zeros((2,))
            A[0] = - 0.5/langevin_stddev**2 * ((preimage_current_x - proposed_x)**2).sum()
            B[0] = - 0.5/langevin_stddev**2 * ((preimage_proposed_x - current_x)**2).sum()
            A[1] = -1 * np.log( np.linalg.det( change_of_variable_matrix(preimage_current_x) ) )
            B[1] = -1 * np.log( np.linalg.det( change_of_variable_matrix(preimage_proposed_x) ) )

            asymmetric_correction_log_factor = A[0] + A[1] - B[0] - B[1]

        return (proposed_x, preimage_proposed_x, asymmetric_correction_log_factor)

    def iterate_N_times(current_x, preimage_current_x, energy_difference, N):
        for _ in np.arange(N):
            (proposed_x, preimage_proposed_x, asymmetric_correction_log_factor) = langevin_proposal(current_x, preimage_current_x)

            if accept_all_proposals:
                loga = 0.0
            else:
                # This is a - in front of the energy difference because
                # log( p(proposed_x) / p(current_x) ) \approx -E(proposed_x) - -E(current_x) = - energy_difference(proposed_x, current_x)
                loga = - energy_difference(proposed_x, current_x) / temperature + asymmetric_correction_log_factor

            # The uniform variate is only drawn when the first two tests fail.
            if accept_all_proposals or loga >= 0.0 or loga >= np.log(np.random.uniform(0,1)):
                # accepted !
                current_x = proposed_x
                preimage_current_x = preimage_proposed_x
                iterate_N_times.accepted_counter += 1
            else:
                iterate_N_times.rejected_counter += 1

        return (current_x, preimage_current_x)

    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    # Start with the burn-in iterations.
    current_x = x0
    # not quite the actual pre-image, but it's just for initialization purposes
    preimage_current_x = current_x
    (current_x, preimage_current_x) = iterate_N_times(current_x, preimage_current_x, energy_difference, burn_in)

    # Then we can think about collecting samples.
    samples_list = []
    # Start from the 'current_x' from the burn_in
    # and not from x0. Reset the acceptance counters.
    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    progress_logger = make_progress_logger("Sampling")

    for n in np.arange(0,N):
        (current_x, preimage_current_x) = iterate_N_times(current_x, preimage_current_x, energy_difference, thinning_factor)
        # collect sample after running through the thinning iterations
        samples_list.append(current_x)
        progress_logger(1.0*n/N)

    samples = np.vstack(samples_list)
    acceptance_ratio = iterate_N_times.accepted_counter * 1.0 / (iterate_N_times.accepted_counter + iterate_N_times.rejected_counter)

    return (samples, acceptance_ratio, noise_levels)
def fit_with_stddevs_sequence(self, X, X_valid, stddevs, optimization_args):
    """
    Train through a sequence of noise levels, logging the mean loss
    obtained at each stage.

    stddevs is a dict whose special key 'train' drives the training.
    Every other key ('valid', 'alt_valid', 'valid2', ...) describes a
    sequence of noise levels used to compute validation losses. It has
    the form

        {'train' : [{'target' : 1.0, 'sampled' : 4.0},
                    {'target' : 0.8, 'sampled' : 3.0},
                    ... ],
         'valid' : [{'target' : 1.0, 'sampled' : 4.0},
                    {'target' : 0.8, 'sampled' : 3.0},
                    ... ],
         ... }

    X is an array of shape (n_train, d).
    X_valid is an array of shape (n_valid, d). It can be None, in which
    case no validation loss is computed and the lists of the
    non-'train' keys stay empty.

    optimization_args is passed through to the method 'fit' of this
    class, e.g.
        {'method' : 'fmin_l_bfgs_b',
         'maxiter' : maxiter,
         'm' : lbfgs_rank}

    Returns 'best_q_mean_losses', a dict with the same keys as stddevs
    mapping to one mean loss per training stage. A validation entry whose
    'sampled' field is None is recorded as None.
    """

    validate_the_stddevs_argument(stddevs)

    # r(x) as provided by this DAE
    def walkback_vector_func(inputs):
        return self.encode_decode(inputs)

    progress_logger = make_progress_logger("Training")

    best_q_mean_losses = {}
    for name in stddevs.keys():
        best_q_mean_losses[name] = []

    # Every stage refits the parameters 'best_q', and then only logs
    # losses computed from the current 'best_q' on (X, X_valid). What is
    # left at the end is the content of 'best_q_mean_losses', which
    # serves to judge the usefulness of the learned model.

    train_schedule = stddevs["train"]
    n_stages = len(train_schedule)

    for stage, train_entry in enumerate(train_schedule):

        noisy_X, train_weights = get_noisy_X_and_importance_weights(X, train_entry, walkback_vector_func)
        best_q, train_U_best_q = self.fit(X, noisy_X, train_weights, optimization_args)

        train_mean_loss = train_U_best_q / X.shape[0]
        best_q_mean_losses["train"].append(train_mean_loss)
        sys.stdout.write(" train mean loss is %f\n" % (train_mean_loss,))

        if X_valid is not None:
            for name in stddevs:
                if name == "train":
                    continue

                valid_entry = stddevs[name][stage]
                if valid_entry["sampled"] is None:
                    best_q_mean_losses[name].append(None)
                else:
                    noisy_X_valid, valid_weights = get_noisy_X_and_importance_weights(
                        X_valid, valid_entry, walkback_vector_func
                    )
                    valid_total_loss = self.q_loss(best_q, X_valid, noisy_X_valid, valid_weights).sum()

                    # The loss is normalized by X_valid.shape[0] even
                    # though the importance sampling weights are
                    # involved. The original author judged that the
                    # weights would not make this validation loss useless.
                    valid_mean_loss = valid_total_loss / X_valid.shape[0]
                    best_q_mean_losses[name].append(valid_mean_loss)
                    sys.stdout.write(" %s mean loss is %f\n" % (str(name), valid_mean_loss))

        progress_logger(1.0 * (stage + 1) / n_stages)

    return best_q_mean_losses
def fit_with_decreasing_noise(
    self,
    X,
    list_of_train_stddev,
    optimization_args,
    early_termination_args=None,
    X_valid=None,
    list_of_additional_valid_stddev=None
):
    """
    Fit the model once per value of list_of_train_stddev, in order, and
    record the mean losses obtained along the way.

    The 'optimization_args' filters through to the 'fit' function
    almost unchanged. There is a special provision for its 'maxiter'
    entry when it is a list or a numpy array. In that situation, we use
    one value of maxiter for each value of list_of_train_stddev.

    The 'early_termination_args' is optional. It provides a way to stop
    the training if we determine that we started in a state that was
    irredeemable and would only lead to a bad local minimum.

    We can keep in mind the r(x) = x solution as a benchmark and
    observe that, with r(x) = x we would have a loss function that
    roughly equals
        d * train_stddev**2, where d is the dimension of the data.

    The 'early_termination_args' dict has one key for now.
        early_termination_args['stop_if_loss_greater_than'] = [...]
        or
        early_termination_args['stop_if_loss_greater_than'] = "auto"
    With "auto", the thresholds are X.shape[1] * train_stddev**2. The
    dict handed in by the caller is never modified.

    If X_valid is not None, the objective function is also evaluated on
    those validation samples, and those values are the ones compared
    against the thresholds. Otherwise the train mean loss is compared.

    Returns the tuple
        (seq_train_mean_best_U_q,
         seq_valid_mean_best_U_q,
         seq_valid_mean_U_final_best_q,
         seq_alt_valid_mean_U_final_best_q)
    The first two lists are padded with np.nan up to the length of
    list_of_train_stddev when the training stopped early. The last two
    are None when they were not computed.

    Raises ValueError when 'stop_if_loss_greater_than' is a string other
    than "auto".
    """

    # Work on a private copy. Resolving "auto" in place would leave a list
    # of thresholds in the caller's dict, which would then be silently
    # reused with a different list_of_train_stddev.
    early_termination_args = dict(early_termination_args or {})

    want_early_termination = "stop_if_loss_greater_than" in early_termination_args
    loss_thresholds = early_termination_args.get("stop_if_loss_greater_than")

    # If we were passed the argument "auto", we have to replace the
    # value with an array of corresponding values.
    if want_early_termination and isinstance(loss_thresholds, str):
        if loss_thresholds == "auto":
            loss_thresholds = [
                X.shape[1] * train_stddev ** 2 for train_stddev in list_of_train_stddev
            ]
            print("early termination with losses : ")
            print(loss_thresholds)
        else:
            raise ValueError("Wrong value for early_termination_args. Only valid string is 'auto'.")

    # np.array is a factory function and never equals type(x), so the
    # check has to be made against np.ndarray.
    maxiter_per_stddev = isinstance(optimization_args.get("maxiter"), (list, np.ndarray))

    # at some point we might want to decide to
    # record all the best_q for the sequence
    seq_train_mean_best_U_q = []
    seq_valid_mean_best_U_q = []
    best_q = None

    n_stddevs = len(list_of_train_stddev)
    progress_logger = make_progress_logger("Training")

    for i, train_stddev in enumerate(list_of_train_stddev):

        sys.stdout.write(" Using train_stddev %f, " % train_stddev)
        (noisy_X, importance_sampling_weights) = isotropic_gaussian_noise_and_importance_sampling_weights(
            X, 4.0 * train_stddev, train_stddev
        )

        if maxiter_per_stddev:
            assert len(optimization_args["maxiter"]) == n_stddevs
            optimization_args0 = conj(optimization_args, "maxiter", optimization_args["maxiter"][i])
        else:
            optimization_args0 = optimization_args

        # NOTE(review): the importance sampling weights are not passed to
        # 'fit' here although the sanity check below compares against a
        # loss computed with them -- confirm against the signature of fit.
        (best_q, train_U_best_q_) = self.fit(X, noisy_X, optimization_args0)

        train_U_best_q = self.q_loss(best_q, X, noisy_X, importance_sampling_weights).sum()
        # sanity check to make sure that we're evaluating this right
        assert abs(train_U_best_q - train_U_best_q_) < 1e-8

        train_mean_U_best_q = train_U_best_q / X.shape[0]
        seq_train_mean_best_U_q.append(train_mean_U_best_q)
        sys.stdout.write("train mean loss is %f, " % (train_mean_U_best_q,))

        if X_valid is not None:
            (noisy_X_valid, importance_sampling_weights) = isotropic_gaussian_noise_and_importance_sampling_weights(
                X_valid, 4.0 * train_stddev, train_stddev
            )
            valid_U_best_q = self.q_loss(best_q, X_valid, noisy_X_valid, importance_sampling_weights).sum()
            valid_mean_U_best_q = valid_U_best_q / X_valid.shape[0]
            seq_valid_mean_best_U_q.append(valid_mean_U_best_q)
            sys.stdout.write("valid mean loss is %f." % (valid_mean_U_best_q,))

            # if we're dealing with a validation set, it will be the one used
            # to determine the stopping point
            stopping_loss = valid_mean_U_best_q
        else:
            # if we don't have a validation set, then we'll use the
            # train mean loss for the termination condition
            stopping_loss = train_mean_U_best_q

        if want_early_termination and loss_thresholds[i] < stopping_loss:
            break

        print("")
        progress_logger(1.0 * i / n_stddevs)
    # end for

    # might as well pad the rest of the list to
    # signify that we terminated early
    while len(seq_train_mean_best_U_q) < n_stddevs:
        seq_train_mean_best_U_q.append(np.nan)
    while len(seq_valid_mean_best_U_q) < n_stddevs:
        seq_valid_mean_best_U_q.append(np.nan)

    # Now we want to recompute the model losses for all the values of
    # the train_stddev, but using the final parameters best_q.
    # This will be used as an addition quality evaluation to determine
    # how the DAE treats data that's relatively far from the manifold
    # once it's done training.
    # It might be even more informative than the validation losses.

    seq_valid_mean_U_final_best_q = None
    seq_alt_valid_mean_U_final_best_q = None
    # best_q is still None when list_of_train_stddev was empty, in which
    # case there is no model to evaluate.
    if X_valid is not None and best_q is not None:
        nreps = 10

        # The noisy data has to be regenerated for every noise level and
        # for every repetition. Generating it once before the loop made
        # every entry of the list the same number.
        seq_valid_mean_U_final_best_q = []
        for train_stddev in list_of_train_stddev:
            rep_losses = []
            for _ in range(nreps):
                (noisy_X_valid, importance_sampling_weights) = isotropic_gaussian_noise_and_importance_sampling_weights(
                    X_valid, 4.0 * train_stddev, train_stddev
                )
                rep_losses.append(
                    self.q_loss(best_q, X_valid, noisy_X_valid, importance_sampling_weights).sum()
                    / X_valid.shape[0]
                )
            seq_valid_mean_U_final_best_q.append(np.array(rep_losses).mean())

        if (list_of_additional_valid_stddev is not None) and len(list_of_additional_valid_stddev) > 0:
            # TODO : use some kind of tool to generate the importance_sampling_weights
            seq_alt_valid_mean_U_final_best_q = [
                np.array(
                    [
                        self.q_loss(
                            best_q, X_valid, X_valid + np.random.normal(size=X_valid.shape, scale=alt_valid_stddev)
                        ).sum()
                        / X_valid.shape[0]
                        for _ in range(nreps)
                    ]
                ).mean()
                for alt_valid_stddev in list_of_additional_valid_stddev
            ]
    # end if

    return (
        seq_train_mean_best_U_q,
        seq_valid_mean_best_U_q,
        seq_valid_mean_U_final_best_q,
        seq_alt_valid_mean_U_final_best_q,
    )
def sample_chain(x0,
                 N,
                 energy_difference,
                 noise_levels,
                 r,
                 r_prime,
                 thinning_factor=1,
                 burn_in=0,
                 accept_all_proposals=False,
                 proposal_noise_scheme='merge_x',
                 omit_asymmetric_proposal_factor=False):
    """
    Will sample N values for the chain starting with x0, with a
    Metropolis-Hastings acceptance test and a Langevin-style proposal
    built from r.

    Arguments
        x0 : starting point, must be a 1-D array.
        N : number of samples to collect.
        energy_difference : callable invoked as
            energy_difference(proposed_x, current_x). Its negation divided
            by the temperature is used as the log of the density ratio.
        noise_levels : dict. The keys "train_stddev", "langevin_stddev",
            "langevin_beta" and "temperature" are all read
            unconditionally, so all four have to be present.
        r : callable applied to the noisy point.
        r_prime : callable whose result is combined with np.eye(d) and
            passed to np.linalg.det (presumably the (d, d) Jacobian of r
            -- to be confirmed against the callers).
        thinning_factor : chain steps between collected samples (>= 1).
        burn_in : steps taken and discarded before collecting.
        accept_all_proposals : accept everything, without computing the
            acceptance ratio.
        proposal_noise_scheme : 'merge_x', 'noise_E' or 'noise_r'. It
            selects how the noisy point
            x_noisy = current_x + N(0, langevin_stddev)
            is turned into the proposal (beta stands for langevin_beta) :
                'merge_x' : (1 - beta) * x_noisy + beta * r(x_noisy)
                'noise_E' : current_x - beta * x_noisy + beta * r(x_noisy)
                'noise_r' : (1 - beta) * current_x + beta * r(x_noisy)
        omit_asymmetric_proposal_factor : replace
            log q(current_x | proposed_x) - log q(proposed_x | current_x)
            by 0.0.

    Returns
        (samples, acceptance_ratio, noise_levels) where samples is the
        np.vstack of the collected points and acceptance_ratio only
        counts the steps taken after the burn-in. noise_levels is
        returned as received.

    Raises
        ValueError if proposal_noise_scheme is not recognized.
    """

    # Debugging trace kept from the original implementation.
    print(proposal_noise_scheme)

    assert len(x0.shape) == 1, "Wrong dimension for x0."
    assert thinning_factor >= 1, "You misunderstood the thinning_factor. It should be 1 for no thinning, and 32 if we want one out of every 32 samples."

    # train_stddev is not used below. The lookup is kept so that a
    # missing key is still reported at this point.
    train_stddev = noise_levels["train_stddev"]
    langevin_stddev = noise_levels["langevin_stddev"]
    langevin_beta = noise_levels["langevin_beta"]
    temperature = noise_levels["temperature"]

    # An unknown scheme used to surface only inside the first proposal,
    # as a TypeError caused by raising a plain string. Fail early, and
    # with the intended message.
    recognized_schemes = ('merge_x', 'noise_E', 'noise_r')
    if proposal_noise_scheme not in recognized_schemes:
        raise ValueError("Unrecognized proposal_noise_scheme : %s" %
                         proposal_noise_scheme)

    def langevin_proposal(current_x, preimage_current_x):
        # We are using the term "preimage" here because it corresponds
        # to the preimage when langevin_beta=1.0.
        # Otherwise, it should be called the "noisy_ancestor" or something
        # like that to reflect the fact that it's more about
        #
        #     x_{\textrm{noisy}}^{(t)} = x^{(t)} + \epsilon \hspace{1em} for \hspace{1em} \epsilon \sim \mathcal{N}(0,\sigma^{2})
        #     x^{*} = \left(1-\beta\right) x_{\textrm{noisy}}^{(t)} + \beta r^{*}(x_{\textrm{noisy}}^{(t)})
        #
        # than about being the preimage. Latex the stuff above to read it properly.

        d = current_x.shape[0]

        # The noisy point is drawn the same way for the three schemes.
        preimage_proposed_x = current_x + np.random.normal(
            size=(d, ), scale=langevin_stddev)

        if proposal_noise_scheme == 'merge_x':
            proposed_x = (1 - langevin_beta) * preimage_proposed_x \
                + langevin_beta * r(preimage_proposed_x)
        elif proposal_noise_scheme == 'noise_E':
            proposed_x = current_x - langevin_beta * preimage_proposed_x \
                + langevin_beta * r(preimage_proposed_x)
        else:
            # 'noise_r', the only remaining validated value
            proposed_x = (1 - langevin_beta) * current_x \
                + langevin_beta * r(preimage_proposed_x)

        if accept_all_proposals or omit_asymmetric_proposal_factor:
            return (proposed_x, preimage_proposed_x, 0.0)

        # Now we need to compute
        #     log q( current_x | proposed_x ) - log q( proposed_x | current_x )

        def neg_log_det(preimage):
            # NOTE(review): the original author marked the 'noise_E' and
            # 'noise_r' expressions as uncertain ("clueless").
            if proposal_noise_scheme == 'merge_x':
                mat = (1 - langevin_beta) * np.eye(d) \
                    + langevin_beta * r_prime(preimage)
            elif proposal_noise_scheme == 'noise_E':
                mat = (-langevin_beta) * np.eye(d) \
                    + langevin_beta * r_prime(preimage)
            else:
                mat = langevin_beta * r_prime(preimage)
            return -1 * np.log(np.linalg.det(mat))

        # Index 0 : Gaussian part. Index 1 : log-determinant part.
        A = np.zeros((2, ))
        B = np.zeros((2, ))
        A[0] = -0.5 / langevin_stddev**2 * (
            (preimage_current_x - proposed_x)**2).sum()
        B[0] = -0.5 / langevin_stddev**2 * (
            (preimage_proposed_x - current_x)**2).sum()
        A[1] = neg_log_det(preimage_current_x)
        B[1] = neg_log_det(preimage_proposed_x)

        asymmetric_correction_log_factor = A[0] + A[1] - B[0] - B[1]

        return (proposed_x, preimage_proposed_x,
                asymmetric_correction_log_factor)

    def iterate_N_times(current_x, preimage_current_x, energy_difference, N):
        for _ in np.arange(N):
            (proposed_x, preimage_proposed_x,
             asymmetric_correction_log_factor) = langevin_proposal(
                 current_x, preimage_current_x)

            if accept_all_proposals:
                loga = 0.0
            else:
                # This is a - in front of the energy difference because
                # log( p(proposed_x) / p(current_x) ) \approx -E(proposed_x) - -E(current_x) = - energy_difference(proposed_x, current_x)
                loga = -energy_difference(
                    proposed_x,
                    current_x) / temperature + asymmetric_correction_log_factor

            # The uniform variate is only drawn if the first two tests fail.
            if accept_all_proposals or loga >= 0.0 or loga >= np.log(
                    np.random.uniform(0, 1)):
                # accepted !
                current_x = proposed_x
                preimage_current_x = preimage_proposed_x
                iterate_N_times.accepted_counter += 1
            else:
                iterate_N_times.rejected_counter += 1

        return (current_x, preimage_current_x)

    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    # Start with the burn-in iterations.
    current_x = x0
    # not quite the actual pre-image, but it's just for initialization purposes
    preimage_current_x = current_x
    (current_x, preimage_current_x) = iterate_N_times(current_x,
                                                      preimage_current_x,
                                                      energy_difference,
                                                      burn_in)

    # Then we can think about collecting samples.
    samples_list = []
    # Start from the 'current_x' from the burn_in
    # and not from x0. Reset the acceptance counters.
    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    progress_logger = make_progress_logger("Sampling")

    for n in np.arange(0, N):
        (current_x, preimage_current_x) = iterate_N_times(current_x,
                                                          preimage_current_x,
                                                          energy_difference,
                                                          thinning_factor)
        # collect sample after running through the thinning iterations
        samples_list.append(current_x)
        progress_logger(1.0 * n / N)

    samples = np.vstack(samples_list)
    acceptance_ratio = iterate_N_times.accepted_counter * 1.0 / (
        iterate_N_times.accepted_counter + iterate_N_times.rejected_counter)

    return (samples, acceptance_ratio, noise_levels)
def sample_chain(x0,
                 N,
                 energy_difference,
                 noise_levels,
                 r,
                 r_prime,
                 f_prime,
                 thinning_factor=1,
                 burn_in=0,
                 accept_all_proposals=False,
                 proposal_noise_scheme='merge_x',
                 omit_asymmetric_proposal_factor=False):
    """
    Metropolis-Hastings chain whose proposal noise is shaped by f_prime.

            f        g
        X -----> H -----> X

        dim(X) = m
        dim(H) = n
        r = g * f

    In this implementation, we use the following shapes for the arguments.

        r         : R^m -> R^m
        r_prime   : R^m -> R^{m x m}  (added to np.eye(m), then passed to
                                       np.linalg.det)
        f_prime   : R^m -> R^{n x m}  (the number of columns is asserted
                                       to be m, the number of rows is the
                                       dimension of the noise that is drawn)
        energy_difference : (R^m, R^m) -> R
                            called as energy_difference(proposed_x, current_x).
                            Its NEGATION, divided by the temperature, is used
                            as log(p(proposed_x)) - log(p(current_x)).

    noise_levels is a dict from which the keys "train_stddev",
    "langevin_stddev", "langevin_beta" and "temperature" are all read
    unconditionally, so all four have to be present.

    thinning_factor is the number of chain steps between two collected
    samples (at least 1) and burn_in the number of steps discarded before
    collecting. With accept_all_proposals every proposal is accepted.
    proposal_noise_scheme has to be 'merge_x' in this implementation.
    omit_asymmetric_proposal_factor replaces
        log q( current_x | proposed_x ) - log q( proposed_x | current_x )
    by 0.0.

    Returns (samples, acceptance_ratio, noise_levels). samples is the
    np.vstack of the collected points, acceptance_ratio only counts the
    steps taken after the burn-in, noise_levels is returned as received.
    """

    assert len(x0.shape) == 1, "Wrong dimension for x0."
    # Same check as in the other variants of this sampler. Without it, a
    # value below 1 runs the whole chain and only then fails on a division
    # by zero in the acceptance ratio.
    assert thinning_factor >= 1, "You misunderstood the thinning_factor. It should be 1 for no thinning, and 32 if we want one out of every 32 samples."
    assert f_prime

    # train_stddev is not used below. The lookup is kept so that a
    # missing key is still reported at this point.
    train_stddev = noise_levels["train_stddev"]
    langevin_stddev = noise_levels["langevin_stddev"]
    langevin_beta = noise_levels["langevin_beta"]
    temperature = noise_levels["temperature"]

    # TODO : use an equivalent to the 'proposal_noise_scheme'
    assert proposal_noise_scheme == "merge_x"

    # Hard-coded switch : when True, f_prime(x) is divided by its 2-norm
    # before being scaled by langevin_stddev.
    want_renormalization_of_J = False

    def scaled_jacobian(x):
        M = f_prime(x)
        if want_renormalization_of_J:
            return M / np.linalg.norm(M, 2) * langevin_stddev
        return M * langevin_stddev

    def proposal(current_x, preimage_current_x):

        d = current_x.shape[0]

        J = scaled_jacobian(current_x)
        z = np.random.normal(size=J.shape[0])
        # z has identity covariance, so the noise J^T z has covariance J^T J.
        preimage_proposed_x = current_x + J.T.dot(z)
        proposed_x = (1 - langevin_beta) * preimage_proposed_x \
            + langevin_beta * r(preimage_proposed_x)

        if omit_asymmetric_proposal_factor:
            # Nothing below is needed, in particular not the determinants.
            return (proposed_x, preimage_proposed_x, 0.0)

        proposed_J = scaled_jacobian(proposed_x)

        # Bear in mind that the covariance of the mvn stemming from current_x
        # will be J^T J and not just J.
        assert J.shape[1] == d
        assert proposed_J.shape[1] == d

        # We will essentially bypass the SVD decomposition by
        # using J^T J instead of V^T D^2 V from the SVD.
        # The two quantities are equivalent.
        # It would still be nice, in a way, to have access to the eigenvalues
        # in order to have more control (truncating ?) and be able to log them
        # as some kind of sanity check (to check Yoshua's fast decay intuition).
        JTJ = J.T.dot(J)
        proposed_JTJ = proposed_J.T.dot(proposed_J)

        def mvn_log_density(v, covariance):
            return (-0.5 * d * np.log(2 * np.pi)
                    - 0.5 * np.log(np.linalg.det(covariance))
                    - 0.5 * v.dot(np.linalg.inv(covariance)).dot(v))

        def neg_log_det_of_merge(preimage):
            return -1 * np.log(
                np.linalg.det((1 - langevin_beta) * np.eye(d) +
                              langevin_beta * r_prime(preimage)))

        # Now we need to compute
        #     log q( current_x | proposed_x ) - log q( proposed_x | current_x )
        # Index 0 holds the Gaussian part, index 1 the log-determinant part.
        A = np.zeros((2, ))
        A[0] = mvn_log_density(preimage_current_x - proposed_x, proposed_JTJ)
        A[1] = neg_log_det_of_merge(preimage_current_x)

        B = np.zeros((2, ))
        B[0] = mvn_log_density(preimage_proposed_x - current_x, JTJ)
        B[1] = neg_log_det_of_merge(preimage_proposed_x)

        asymmetric_correction_log_factor = A[0] + A[1] - B[0] - B[1]

        return (proposed_x, preimage_proposed_x,
                asymmetric_correction_log_factor)

    # end of proposal function

    def iterate_N_times(current_x, preimage_current_x, energy_difference, N):
        for _ in np.arange(N):
            (proposed_x, preimage_proposed_x,
             asymmetric_correction_log_factor) = proposal(
                 current_x, preimage_current_x)

            # This is a - in front of the energy difference because
            # log( p(proposed_x) / p(current_x) ) \approx -E(proposed_x) - -E(current_x) = - energy_difference(proposed_x, current_x)
            loga = -energy_difference(
                proposed_x,
                current_x) / temperature + asymmetric_correction_log_factor

            # The uniform variate is only drawn if the first two tests fail.
            if accept_all_proposals or loga >= 0 or loga >= np.log(
                    np.random.uniform(0, 1)):
                # accepted !
                current_x = proposed_x
                preimage_current_x = preimage_proposed_x
                iterate_N_times.accepted_counter += 1
            else:
                iterate_N_times.rejected_counter += 1

        return (current_x, preimage_current_x)

    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    # Start with the burn-in iterations.
    current_x = x0
    # not quite the actual pre-image, but it's just for initialization purposes
    preimage_current_x = current_x
    (current_x, preimage_current_x) = iterate_N_times(current_x,
                                                      preimage_current_x,
                                                      energy_difference,
                                                      burn_in)

    # Then we can think about collecting samples.
    samples_list = []
    # Start from the 'current_x' from the burn_in
    # and not from x0. Reset the acceptance counters.
    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    progress_logger = make_progress_logger("Sampling")

    for n in np.arange(0, N):
        (current_x, preimage_current_x) = iterate_N_times(current_x,
                                                          preimage_current_x,
                                                          energy_difference,
                                                          thinning_factor)
        # collect sample after running through the thinning iterations
        samples_list.append(current_x)
        progress_logger(1.0 * n / N)

    samples = np.vstack(samples_list)
    acceptance_ratio = iterate_N_times.accepted_counter * 1.0 / (
        iterate_N_times.accepted_counter + iterate_N_times.rejected_counter)

    return (samples, acceptance_ratio, noise_levels)
def sample_chain(x0, N, energy_difference, noise_levels,
                 r, r_prime, f_prime,
                 thinning_factor=1,
                 burn_in=0,
                 accept_all_proposals=False,
                 proposal_noise_scheme='merge_x',
                 omit_asymmetric_proposal_factor=False):
    """
    Metropolis-Hastings chain whose proposals are shaped by a DAE.

           f        g
        X -----> H -----> X        r = g * f

    A proposal is built in two steps from the current state:

        preimage_proposed_x = current_x + J^T z,   z ~ N(0, I)
        proposed_x = (1 - beta) * preimage_proposed_x
                     + beta * r(preimage_proposed_x)

    where J = langevin_stddev * f_prime(current_x) and
    beta = langevin_beta. Unless 'omit_asymmetric_proposal_factor' is
    set, the log acceptance ratio is corrected by
    log q(current_x | proposed_x) - log q(proposed_x | current_x).

    Arguments
    ---------
    x0 : 1-D array, the starting point (d = x0.shape[0]).
    N : number of samples to collect.
    energy_difference : callable (proposed_x, current_x) -> scalar.
        Its negation, divided by the temperature, is used as
        log( p(proposed_x) / p(current_x) ).
    noise_levels : dict. The keys "train_stddev", "langevin_stddev",
        "langevin_beta" and "temperature" are all read here (a missing
        one raises KeyError). The dict is returned untouched.
    r : callable, the reconstruction function. Its output is combined
        linearly with its input, so both must have the same shape.
    r_prime : callable returning the Jacobian of r. It is added to
        np.eye(d), so a (d, d) array is expected.
    f_prime : callable returning a 2-D array with d columns (asserted
        when the asymmetric correction is computed).
    thinning_factor : number of MCMC steps between collected samples.
    burn_in : number of MCMC steps discarded before collecting.
    accept_all_proposals : if True, every proposal is accepted.
    proposal_noise_scheme : only "merge_x" is supported (asserted).
    omit_asymmetric_proposal_factor : if True, the proposal is treated
        as if it were symmetric (correction factor of 0).

    Returns
    -------
    (samples, acceptance_ratio, noise_levels)
        samples has shape (N, d). acceptance_ratio only counts the
        steps taken after the burn-in; it is nan when no proposal was
        evaluated (e.g. N == 0).
    """

    assert len(x0.shape) == 1, "Wrong dimension for x0."
    assert f_prime

    # 'train_stddev' is not used by the sampler itself. The lookup is kept
    # so that a noise_levels dict without that key still fails right here.
    train_stddev = noise_levels["train_stddev"]
    langevin_stddev = noise_levels["langevin_stddev"]
    langevin_beta = noise_levels["langevin_beta"]
    temperature = noise_levels["temperature"]

    # TODO : use an equivalent to the 'proposal_noise_scheme'
    assert proposal_noise_scheme == "merge_x"

    # Developer toggle : rescale the Jacobian by its spectral norm before
    # applying langevin_stddev.
    want_renormalization_of_J = False

    def scaled_jacobian(x):
        # J such that the proposal noise around x has covariance J^T J.
        M = f_prime(x)
        if want_renormalization_of_J:
            M = M / np.linalg.norm(M, 2)
        return M * langevin_stddev

    def log_det(matrix):
        # np.log(np.linalg.det(.)) underflows to -inf (or overflows to inf)
        # as soon as the dimension gets moderately large, which turns the
        # correction factor into nan. slogdet stays in log space.
        # The conventions of np.log are preserved : -inf for a singular
        # matrix and nan for a negative determinant.
        (sign, log_abs_det) = np.linalg.slogdet(matrix)
        if sign >= 0:
            return log_abs_det
        return np.nan

    def log_gaussian_density(displacement, covariance):
        # log N(displacement ; 0, covariance). We solve the linear system
        # instead of forming the explicit inverse of the covariance.
        dim = displacement.shape[0]
        return (- 0.5 * dim * np.log(2 * np.pi)
                - 0.5 * log_det(covariance)
                - 0.5 * displacement.dot(
                    np.linalg.solve(covariance, displacement)))

    def log_mixing_volume_factor(preimage_x, d):
        # -log det of the Jacobian of x |-> (1 - beta) * x + beta * r(x)
        return -1 * log_det((1 - langevin_beta) * np.eye(d)
                            + langevin_beta * r_prime(preimage_x))

    def proposal(current_x, preimage_current_x):
        d = current_x.shape[0]

        J = scaled_jacobian(current_x)
        z = np.random.normal(size=J.shape[0])
        preimage_proposed_x = current_x + J.T.dot(z)
        proposed_x = ((1 - langevin_beta) * preimage_proposed_x
                      + langevin_beta * r(preimage_proposed_x))

        if omit_asymmetric_proposal_factor:
            return (proposed_x, preimage_proposed_x, 0.0)

        proposed_J = scaled_jacobian(proposed_x)

        # Bear in mind that the covariance of the mvn stemming from
        # current_x will be J^T J and not just J.
        assert J.shape[1] == d
        assert proposed_J.shape[1] == d

        # We bypass the SVD decomposition by using J^T J instead of
        # V^T D^2 V from the SVD. The two quantities are equivalent.

        # Now we need to compute
        # log q( current_x | proposed_x ) - log q( proposed_x | current_x )
        A0 = log_gaussian_density(preimage_current_x - proposed_x,
                                  proposed_J.T.dot(proposed_J))
        A1 = log_mixing_volume_factor(preimage_current_x, d)

        B0 = log_gaussian_density(preimage_proposed_x - current_x,
                                  J.T.dot(J))
        B1 = log_mixing_volume_factor(preimage_proposed_x, d)

        asymmetric_correction_log_factor = A0 + A1 - B0 - B1

        return (proposed_x, preimage_proposed_x,
                asymmetric_correction_log_factor)
    # end of proposal function

    counters = {"accepted": 0, "rejected": 0}

    def iterate_N_times(current_x, preimage_current_x, n_steps):
        for _ in np.arange(n_steps):
            (proposed_x,
             preimage_proposed_x,
             asymmetric_correction_log_factor) = proposal(
                 current_x, preimage_current_x)

            # There is a - in front of the energy difference because
            # log( p(proposed_x) / p(current_x) )
            #     \approx -E(proposed_x) - -E(current_x)
            #     = - energy_difference(proposed_x, current_x)
            loga = (- energy_difference(proposed_x, current_x) / temperature
                    + asymmetric_correction_log_factor)

            # The uniform draw is only made when it is actually needed,
            # thanks to the short-circuiting of 'or'.
            if (accept_all_proposals
                    or loga >= 0
                    or loga >= np.log(np.random.uniform(0, 1))):
                # accepted !
                current_x = proposed_x
                preimage_current_x = preimage_proposed_x
                counters["accepted"] += 1
            else:
                counters["rejected"] += 1

        return (current_x, preimage_current_x)

    # Start with the burn-in iterations.
    current_x = x0
    # not quite the actual pre-image, but it's just for initialization purposes
    preimage_current_x = current_x
    (current_x, preimage_current_x) = iterate_N_times(
        current_x, preimage_current_x, burn_in)

    # Then we can think about collecting samples.
    # Start from the 'current_x' from the burn_in and not from x0.
    # Reset the acceptance counters.
    samples_list = []
    counters["accepted"] = 0
    counters["rejected"] = 0

    progress_logger = make_progress_logger("Sampling")

    for n in np.arange(0, N):
        (current_x, preimage_current_x) = iterate_N_times(
            current_x, preimage_current_x, thinning_factor)
        # collect sample after running through the thinning iterations
        samples_list.append(current_x)
        progress_logger(1.0 * n / N)

    if samples_list:
        samples = np.vstack(samples_list)
    else:
        # np.vstack refuses an empty list
        samples = np.empty((0, x0.shape[0]))

    n_proposals = counters["accepted"] + counters["rejected"]
    if n_proposals > 0:
        acceptance_ratio = counters["accepted"] * 1.0 / n_proposals
    else:
        acceptance_ratio = float('nan')

    return (samples, acceptance_ratio, noise_levels)
def fit_with_decreasing_noise(self, X, list_of_train_stddev,
                              optimization_args,
                              early_termination_args=None,
                              X_valid=None,
                              list_of_additional_valid_stddev=None):
    """
    Fit the model successively for every value in 'list_of_train_stddev'.

    The 'optimization_args' filters through to the 'fit' function
    almost unchanged. There is a special provision for its 'maxiter'
    entry when it is a list (or a numpy array). In such a situation,
    we use one value of maxiter from the list for each value of
    list_of_train_stddev.

    The 'early_termination_args' is optional. It provides a way to
    stop the training if we determine that we started in a state
    that was irredeemable and would only lead to a bad local minimum.
    We can keep in mind the r(x) = x solution as a benchmark and
    observe that, with r(x) = x we would have a loss function that
    roughly equals d * train_stddev**2, where d is the dimension of
    the data.

    The 'early_termination_args' dict has one key for now.
        early_termination_args['stop_if_loss_greater_than'] = [...]
    or
        early_termination_args['stop_if_loss_greater_than'] = "auto"
    With "auto", the thresholds are X.shape[1] * train_stddev**2.
    Any other string prints an error message and terminates the
    process through quit(). The dict passed by the caller is not
    modified.

    If X_valid is not None, we also return the values of the objective
    function evaluated with those validation samples. Those values are
    then the ones according to which we decide whether or not to stop
    the descent through the train_stddev values. Without validation
    samples, the mean training loss is used instead.

    Returns the tuple
        (seq_train_mean_best_U_q,
         seq_valid_mean_best_U_q,
         seq_valid_mean_U_final_best_q,
         seq_alt_valid_mean_U_final_best_q)
    The first two lists have one entry for each train_stddev and are
    padded with nan when we terminate early (or, for the second one,
    when there is no validation set). The last two are None unless
    X_valid is given (and, for the last one, unless
    list_of_additional_valid_stddev is non-empty).
    """

    # Work on a private copy : we never want to share a default dict
    # between calls, nor to rewrite the dict owned by the caller when
    # we expand "auto" below.
    early_termination_args = dict(early_termination_args or {})
    has_loss_thresholds = ('stop_if_loss_greater_than'
                           in early_termination_args)

    # If we were passed the argument "auto", we have to replace the
    # value with an array of corresponding values.
    if (has_loss_thresholds and
            isinstance(early_termination_args['stop_if_loss_greater_than'],
                       str)):
        if early_termination_args['stop_if_loss_greater_than'] == "auto":
            early_termination_args['stop_if_loss_greater_than'] = [
                X.shape[1] * train_stddev**2
                for train_stddev in list_of_train_stddev
            ]
            print("early termination with losses : ")
            print(early_termination_args['stop_if_loss_greater_than'])
        else:
            print("Wrong value for early_termination_args. Only valid string is 'auto'.")
            print("Exiting.")
            quit()

    # at some point we might want to decide to
    # record all the best_q for the sequence
    seq_train_mean_best_U_q = []
    seq_valid_mean_best_U_q = []

    n_stddevs = len(list_of_train_stddev)
    progress_logger = make_progress_logger("Training")

    for (i, train_stddev) in enumerate(list_of_train_stddev):

        sys.stdout.write(" Using train_stddev %f, " % train_stddev)
        (noisy_X, importance_sampling_weights) = \
            isotropic_gaussian_noise_and_importance_sampling_weights(
                X, 4.0 * train_stddev, train_stddev)

        # np.ndarray is the array type (np.array is only the function
        # that builds arrays, so it can never be the type of anything).
        if ('maxiter' in optimization_args and
                isinstance(optimization_args['maxiter'],
                           (list, np.ndarray))):
            assert len(optimization_args['maxiter']) == n_stddevs
            optimization_args0 = conj(optimization_args, "maxiter",
                                      optimization_args['maxiter'][i])
        else:
            optimization_args0 = optimization_args

        (best_q, train_U_best_q_) = self.fit(X, noisy_X, optimization_args0)

        train_U_best_q = self.q_loss(best_q, X, noisy_X,
                                     importance_sampling_weights).sum()
        # sanity check to make sure that we're evaluating this right
        assert (abs(train_U_best_q - train_U_best_q_) < 1e-8)

        train_mean_U_best_q = train_U_best_q / X.shape[0]
        seq_train_mean_best_U_q.append(train_mean_U_best_q)
        sys.stdout.write("train mean loss is %f, " % (train_mean_U_best_q, ))

        if X_valid is not None:
            (noisy_X_valid, importance_sampling_weights) = \
                isotropic_gaussian_noise_and_importance_sampling_weights(
                    X_valid, 4.0 * train_stddev, train_stddev)
            valid_U_best_q = self.q_loss(best_q, X_valid, noisy_X_valid,
                                         importance_sampling_weights).sum()
            valid_mean_U_best_q = valid_U_best_q / X_valid.shape[0]
            seq_valid_mean_best_U_q.append(valid_mean_U_best_q)
            sys.stdout.write("valid mean loss is %f." % (valid_mean_U_best_q, ))

            # if we're dealing with a validation set, it will be the one
            # used to determine the stopping point
            monitored_mean_loss = valid_mean_U_best_q
        else:
            # if we don't have a validation set, then we use the mean
            # training loss for the termination condition
            monitored_mean_loss = train_mean_U_best_q

        if (has_loss_thresholds and
                early_termination_args['stop_if_loss_greater_than'][i]
                < monitored_mean_loss):
            break

        print("")
        progress_logger(1.0 * i / n_stddevs)
    # end for

    # might as well pad the rest of the list to
    # signify that we terminated early
    seq_train_mean_best_U_q.extend(
        [np.nan] * (n_stddevs - len(seq_train_mean_best_U_q)))
    seq_valid_mean_best_U_q.extend(
        [np.nan] * (n_stddevs - len(seq_valid_mean_best_U_q)))

    # Now we want to recompute the model losses for all the values of
    # the train_stddev, but using the final parameters best_q.
    # This will be used as an additional quality evaluation to determine
    # how the DAE treats data that's relatively far from the manifold
    # once it's done training.
    # It might be even more informative than the validation losses.

    seq_valid_mean_U_final_best_q = None
    seq_alt_valid_mean_U_final_best_q = None
    if X_valid is not None:
        nreps = 10

        # The noisy data has to be generated again for every stddev and
        # for every repetition. Generating it only once, ahead of the
        # loops, would score the very same noisy samples every time.
        seq_valid_mean_U_final_best_q = []
        for train_stddev in list_of_train_stddev:
            rep_mean_losses = []
            for _ in range(nreps):
                (noisy_X_valid, importance_sampling_weights) = \
                    isotropic_gaussian_noise_and_importance_sampling_weights(
                        X_valid, 4.0 * train_stddev, train_stddev)
                rep_mean_losses.append(
                    self.q_loss(best_q, X_valid, noisy_X_valid,
                                importance_sampling_weights).sum()
                    / X_valid.shape[0])
            seq_valid_mean_U_final_best_q.append(
                np.array(rep_mean_losses).mean())

        if (list_of_additional_valid_stddev is not None
                and len(list_of_additional_valid_stddev) > 0):
            # TODO : use some kind of tool to generate the
            #        importance_sampling_weights
            seq_alt_valid_mean_U_final_best_q = [
                np.array([
                    self.q_loss(
                        best_q, X_valid,
                        X_valid + np.random.normal(size=X_valid.shape,
                                                   scale=alt_valid_stddev)
                    ).sum() / X_valid.shape[0]
                    for _ in range(nreps)
                ]).mean()
                for alt_valid_stddev in list_of_additional_valid_stddev
            ]
    # end if

    return (seq_train_mean_best_U_q,
            seq_valid_mean_best_U_q,
            seq_valid_mean_U_final_best_q,
            seq_alt_valid_mean_U_final_best_q)
def fit_with_stddevs_sequence(self, X, X_valid, stddevs, optimization_args):
    """
    Train through the 'train' sequence of noise levels, logging losses.

    stddevs is a dict of sequences, of the form

        {'train' : [{'target' : 1.0, 'sampled' : 4.0},
                    {'target' : 0.8, 'sampled' : 3.0},
                    ... ],
         'valid' : [{'target' : 1.0, 'sampled' : 4.0},
                    {'target' : 0.8, 'sampled' : 3.0},
                    ... ],
         ... }

    The key 'train' is special : its entries drive the training, one
    call to 'fit' per entry. Every other key ('valid', 'alt_valid',
    'valid2', ...) describes noise used only to measure a validation
    loss with the freshly fitted parameters. An entry whose 'sampled'
    field is None gets logged as None without evaluating anything.

    X is an array of shape (n_train, d).
    X_valid is an array of shape (n_valid, d). It can be None, in
    which case only the 'train' losses get filled in.

    optimization_args is passed through to the method 'fit' of this class.
    example : {'method' : 'fmin_l_bfgs_b',
               'maxiter' : maxiter,
               'm' : lbfgs_rank}

    Returns 'best_q_mean_losses', a dict with the same keys as stddevs
    mapping to the list of mean losses recorded at every training stage.
    """

    validate_the_stddevs_argument(stddevs)

    # r(x) for this DAE, handed to the helper that produces the noisy data
    def reconstruct(batch):
        return self.encode_decode(batch)

    report_progress = make_progress_logger("Training")

    best_q_mean_losses = {name: [] for name in stddevs.keys()}

    # Everything below only mutates the learned parameters 'best_q' and
    # records, in 'best_q_mean_losses', how good they are on the
    # datasets (X, X_valid) at every stage.

    validation_names = [name for name in stddevs.keys() if name != 'train']
    n_stages = len(stddevs['train'])

    for (stage, train_noise) in enumerate(stddevs['train']):

        (noisy_X, train_weights) = get_noisy_X_and_importance_weights(
            X, train_noise, reconstruct)

        (best_q, train_U_best_q) = self.fit(X, noisy_X, train_weights,
                                            optimization_args)

        mean_train_loss = train_U_best_q / X.shape[0]
        best_q_mean_losses['train'].append(mean_train_loss)
        sys.stdout.write(" train mean loss is %f\n" % (mean_train_loss, ))

        if X_valid is not None:
            for name in validation_names:
                valid_noise = stddevs[name][stage]

                if valid_noise['sampled'] is None:
                    best_q_mean_losses[name].append(None)
                else:
                    (noisy_X_valid, valid_weights) = \
                        get_noisy_X_and_importance_weights(
                            X_valid, valid_noise, reconstruct)

                    # Despite the importance sampling weights being
                    # involved, the sum is still normalized by the number
                    # of validation samples.
                    mean_valid_loss = self.q_loss(
                        best_q, X_valid, noisy_X_valid,
                        valid_weights).sum() / X_valid.shape[0]

                    best_q_mean_losses[name].append(mean_valid_loss)
                    sys.stdout.write(" %s mean loss is %f\n"
                                     % (str(name), mean_valid_loss))

        report_progress(1.0 * (stage + 1) / n_stages)

    return best_q_mean_losses