# Report how long the preceding load step took (``ttt`` is set before this
# fragment).  The single-argument print form emits the same text as the
# Python 2 print statement and also parses under Python 3.
print("TIMER load_data %s" % (time.clock() - ttt))

# Per-run records, filled in while iterating over the training sets.
test_indices_rec, teach_indices_rec = [], []
alpha_predicted, alpha_target = [], []
energy_target, energy_error = [], []

# --------------------------------------------
# Setup a Gaussian Process once and for all so that parameters do not change
# --------------------------------------------
gp = GaussianProcess(
    corr='absolute_exponential',
    theta0=sp.asarray([theta0]),
    nugget=nugget,
    verbose=True,
    normalise=normalise,
    do_features_projection=False,
    low_memory=False,
    metric=metric)

# --------------------------------------------
# Loop over different training sets of the same size
# --------------------------------------------
for iteration in range(Ndatabases):
    # --------------------------------------------
    # Pick Ntest configurations randomly
    # --------------------------------------------
    n_configurations = dataset[target_property].size
    # NOTE(review): randint draws independently, so the test set may contain
    # repeated indices -- confirm that this is acceptable.
    test_indices = list(
        sp.random.randint(0, high=n_configurations, size=Ntest))
    # Teaching indices are drawn with the test indices excluded.
    db_indices = randint_norepeat(
        0, exclude=test_indices, high=n_configurations, size=Nteach)
    teach_indices_rec.append(db_indices)

    # Test configurations come first, teaching configurations after them.
    # NOTE(review): ``+`` concatenates only if db_indices is a list -- verify
    # against randint_norepeat.
    selected = test_indices + db_indices
    X = dataset['X'][selected]
    T = dataset[target_property][selected]

    # Visual separator between iterations in the log.
    print("\n" + "-" * 60 + " \n")
# --------------------------------------------
# Features: in this case, only sorted eigenvalues of Coulomb matrix
# --------------------------------------------
ttt = time.clock()
# Each eigenvalue vector is reversed after the solver returns it.
eigX = [eigh(M, eigvals_only=True)[::-1] for M in X]
eigt = [eigh(M, eigvals_only=True)[::-1] for M in Xtest]
print("TIMER eval_features %s" % (time.clock() - ttt))

# Observations, flattened to one dimension.
y = T.ravel()

alpha = []
covmat = []

# Seven trial values of theta0, log-spaced between 1e-2 and 1e5.
# (earlier variant: sp.linspace(1,1, N_models))
theta0_grid = [10.0**i for i in sp.linspace(-2, 5, 7)]
for theta0 in theta0_grid:
    # Setup a Gaussian Process model
    ttt = time.clock()
    gp = GaussianProcess(
        corr='absolute_exponential',
        theta0=sp.asarray([theta0]),
        nugget=1e-3,
        verbose=True,
        low_memory=False)

    # Fit to data (original note: Maximum Likelihood Estimation of the
    # parameters).
    gp.fit(eigX, y)
    print("TIMER teach %s" % (time.clock() - ttt))
    ttt = time.clock()

    # Disabled: prediction on the training set.
    # y_pred, MSE = gp.predict(eigX, eval_MSE=True)
    # sigma = sp.sqrt(MSE)
    # print('\n training set:')
    # print('MAE: %5.2f kcal/mol' % sp.absolute(y_pred-y).mean(axis=0))
    # print('RMSE: %5.2f kcal/mol' % sp.square(y_pred-y).mean(axis=0)**.5)

    # Make the prediction on test set
    y_pred, MSE = gp.predict(eigt, eval_MSE=True)
    sigma = sp.sqrt(MSE)
# Feature vectors for the teaching (X) and test (Xtest) matrices: the
# eigenvalues of each matrix, reversed after the solver returns them.
# ``ttt`` is started before this fragment.
eigX = [eigh(M, eigvals_only=True)[::-1] for M in X]
eigt = [eigh(M, eigvals_only=True)[::-1] for M in Xtest]
print("TIMER eval_features %s" % (time.clock() - ttt))

# Observations, flattened to one dimension.
y = T.ravel()

alpha = []
covmat = []

# Seven trial values of theta0, log-spaced between 1e-2 and 1e5.
# (earlier variant: sp.linspace(1,1, N_models))
theta0_grid = [10.0**i for i in sp.linspace(-2, 5, 7)]
for theta0 in theta0_grid:
    # Setup a Gaussian Process model
    ttt = time.clock()
    gp = GaussianProcess(
        corr='absolute_exponential',
        theta0=sp.asarray([theta0]),
        nugget=1e-3,
        verbose=True,
        low_memory=False)

    # Fit to data (original note: Maximum Likelihood Estimation of the
    # parameters).
    gp.fit(eigX, y)
    print("TIMER teach %s" % (time.clock() - ttt))
    ttt = time.clock()

    # Disabled: prediction on the training set.
    # y_pred, MSE = gp.predict(eigX, eval_MSE=True)
    # sigma = sp.sqrt(MSE)
    # print('\n training set:')
    # print('MAE: %5.2f kcal/mol' % sp.absolute(y_pred-y).mean(axis=0))
    # print('RMSE: %5.2f kcal/mol' % sp.square(y_pred-y).mean(axis=0)**.5)

    # Make the prediction on test set
# --------------------------------------------
# Load all database
# --------------------------------------------
ttt = time.clock()
# Open the pickle in binary mode and close the handle as soon as it has been
# read; the previous ``pickle.load(open(dataset_loc, 'r'))`` left the file
# open and relied on text mode.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(dataset_loc, 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)
# Same output as the Python 2 print statement; also parses under Python 3.
print("TIMER load_data %s" % (time.clock() - ttt))

# Per-run records, filled in while iterating over the training sets.
test_indices_rec, teach_indices_rec = [], []
alpha_predicted, alpha_target = [], []
energy_target, energy_error = [], []

# --------------------------------------------
# Setup a Gaussian Process
# --------------------------------------------
# Two models that differ only in their initial theta0.
gp = GaussianProcess(
    corr='absolute_exponential',
    theta0=sp.asarray([theta0]),
    nugget=nugget,
    verbose=True,
    normalise=normalise,
    do_features_projection=False,
    low_memory=False,
    metric=metric)
gp_level2 = GaussianProcess(
    corr='absolute_exponential',
    theta0=sp.asarray([theta0_level2]),
    nugget=nugget,
    verbose=True,
    normalise=normalise,
    do_features_projection=False,
    low_memory=False,
    metric=metric)

# --------------------------------------------
# Loop over different training sets of the same size
# --------------------------------------------
for iteration in range(Ndatabases):
    # --------------------------------------------
    # Pick Ntest configurations randomly
    # --------------------------------------------
    # NOTE(review): randint draws independently, so the test set may contain
    # repeated indices -- confirm that this is acceptable.
    test_indices = list(sp.random.randint(
        0, high=dataset[target_property].size, size=Ntest))
    # Teaching indices are drawn with the test indices excluded.
    db_indices = randint_norepeat(
        0, exclude=test_indices, high=dataset[target_property].size,
        size=Nteach)
    # Persist the teaching indices of this iteration; the file name carries
    # the iteration number and the current time string.
    sp.save('db_indices_%d-%s' % (iteration, time.ctime()), db_indices)
    teach_indices_rec.append(db_indices)
# NOTE(review): the first four statements presumably belong to a per-property
# loop whose header lies before this fragment -- restore their indentation
# when merging.
# Pairwise distances between the (one-dimensional) property values.
y = sp.spatial.distance.pdist(y[:, None])
frequency, bins, patches = plt.hist(y, bins=53, normed=True)
# Replace the bin edges by bin centres (mean of neighbouring edges).
bins = (bins[:-1] + bins[1:]) / 2
histograms.append(sp.row_stack((bins, frequency)))

# --------------------------------------------
# Setup a Gaussian Process
# --------------------------------------------
theta0 = 1.0e1
nugget = 1.0e-15
normalise = 1
metric = 'cityblock'
gp = GaussianProcess(
    corr='absolute_exponential',
    theta0=sp.asarray([theta0]),
    nugget=nugget,
    verbose=True,
    normalise=normalise,
    do_features_projection=False,
    low_memory=False,
    metric=metric)

# Feature vectors: eigenvalues of every matrix in the dataset, reversed after
# the solver returns them.
eigX = [eigh(M, eigvals_only=True)[::-1] for M in dataset['X']]
gp.calc_kernel_matrix(eigX)
# assumes gp.D is a square distance matrix, so that squareform returns the
# condensed 1-D vector of pairwise distances -- TODO confirm.
d = sp.spatial.distance.squareform(gp.D)

# Histogram the feature-space distances ``d``.  The previous code passed ``y``
# (the property distances from above) here, which left ``d`` unused and drew
# a property histogram under the 'eigenvalues of Coulomb matrix' label.
frequency, bins, patches = plt.hist(d, bins=47, normed=True)
bins = (bins[:-1] + bins[1:]) / 2

# Discard the raw histogram bars and draw line plots instead.
plt.clf()
for i, h in enumerate(histograms):
    plt.plot(h[0], h[1], '-', label=target_properties[i])
plt.plot(bins, frequency, 'o', label='eigenvalues of Coulomb matrix')
plt.xlabel("distance in normalised property space")
plt.ylabel("frequency")
# NOTE(review): the first three statements presumably belong to a
# per-property loop whose header lies before this fragment -- restore their
# indentation when merging.
# ``bins_dummy`` is created before this fragment; rotating it by one and
# averaging with ``bins`` turns the bin edges into bin centres.
bins_dummy.append(bins_dummy.pop(0))
bins = ((bins + sp.asarray(bins_dummy)) / 2)[:-1]
histograms.append(sp.row_stack((bins, frequency)))

# --------------------------------------------
# Setup a Gaussian Process
# --------------------------------------------
theta0 = 1.0e1
nugget = 1.0e-15
normalise = 1
metric = 'cityblock'
gp = GaussianProcess(
    corr='absolute_exponential',
    theta0=sp.asarray([theta0]),
    nugget=nugget,
    verbose=True,
    normalise=normalise,
    do_features_projection=False,
    low_memory=False,
    metric=metric)

# Feature vectors: eigenvalues of every matrix in the dataset, reversed after
# the solver returns them.
eigX = [eigh(M, eigvals_only=True)[::-1] for M in dataset['X']]
gp.calc_kernel_matrix(eigX)
# assumes gp.D is a square distance matrix, so that squareform returns the
# condensed 1-D vector of pairwise distances -- TODO confirm.
d = sp.spatial.distance.squareform(gp.D)

# Histogram the feature-space distances ``d``.  The previous code passed ``y``
# (the property distances computed earlier) here, which left ``d`` unused.
frequency, bins, patches = plt.hist(d, bins=47, normed=True)
# Bin edges -> bin centres (mean of neighbouring edges).
bins = (bins[:-1] + bins[1:]) / 2

# Discard the raw histogram bars and draw line plots instead.
plt.clf()
for i, h in enumerate(histograms):
    plt.plot(h[0], h[1], '-', label=target_properties[i])
ttt = time.clock()
# Open the pickle in binary mode and close the handle as soon as it has been
# read; the previous ``pickle.load(open(dataset_loc, 'r'))`` left the file
# open and relied on text mode.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(dataset_loc, 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)
# Same output as the Python 2 print statement; also parses under Python 3.
print("TIMER load_data %s" % (time.clock() - ttt))

# Per-run records, filled in while iterating over the training sets.
test_indices_rec, teach_indices_rec = [], []
alpha_predicted, alpha_target = [], []
energy_target, energy_error = [], []

# --------------------------------------------
# Setup a Gaussian Process
# --------------------------------------------
# Two models that differ only in their initial theta0.
gp = GaussianProcess(
    corr='absolute_exponential',
    theta0=sp.asarray([theta0]),
    nugget=nugget,
    verbose=True,
    normalise=normalise,
    do_features_projection=False,
    low_memory=False,
    metric=metric)
gp_level2 = GaussianProcess(
    corr='absolute_exponential',
    theta0=sp.asarray([theta0_level2]),
    nugget=nugget,
    verbose=True,
    normalise=normalise,
    do_features_projection=False,
    low_memory=False,
    metric=metric)

# --------------------------------------------
# Loop over different training sets of the same size
# --------------------------------------------