def read_xyz_n_dm(fxyz, design_matrix, use_atomic_descriptors, only_use_species, peratom):
    dm = []
    dm_atomic = []
    # try to read the xyz file
    if fxyz is not None and fxyz != 'none':
        from asaplib.data import ASAPXYZ
        # from asapxyzs import ASAPXYZs
        asapxyz = ASAPXYZ(fxyz)
        if use_atomic_descriptors:
            dm = asapxyz.get_atomic_descriptors(design_matrix, only_use_species)
        else:
            dm, dm_atomic = asapxyz.get_descriptors(design_matrix, peratom)
    else:
        asapxyz = None
        print("Did not provide the xyz file. We can only output descriptor matrix.")
    # we can also load the descriptor matrix from a standalone file
    import os
    if os.path.isfile(design_matrix[0]):
        try:
            import numpy as np
            dm = np.genfromtxt(design_matrix[0], dtype=float)
            print("loaded the descriptor matrix from file: ", design_matrix[0])
        except:
            raise ValueError('Cannot load the descriptor matrix from file')
    return asapxyz, dm, dm_atomic
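# A minimal usage sketch for read_xyz_n_dm (illustrative, not part of the original script).
# It assumes the xyz file exists and already stores a descriptor under the given tag;
# design_matrix[0] may instead point to a standalone matrix file.
import numpy as np

asapxyz_example, dm_example, _ = read_xyz_n_dm(
    fxyz='small_molecules-SOAP.xyz',            # extended xyz file with stored descriptors
    design_matrix=['SOAP-n4-l3-c1.9-g0.23'],    # descriptor tag(s) or a matrix file path
    use_atomic_descriptors=False,               # return per-structure descriptors
    only_use_species=None,                      # only used when use_atomic_descriptors is True
    peratom=False)                              # do not also collect the per-atom matrix
print("design matrix shape:", np.shape(dm_example))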
def main(): """ Test if Ridge regression is working. Parameters ---------- fxyz: string giving location of xyz file prefix: string giving the filename prefix """ fxyz = os.path.join(os.path.split(__file__)[0], 'small_molecules-SOAP.xyz') fmat = ['SOAP-n4-l3-c1.9-g0.23'] fy = 'dft_formation_energy_per_atom_in_eV' prefix = "test-skrr" test_ratio = 0.05 lc_points = 8 lc_repeats = 8 # try to read the xyz file asapxyz = ASAPXYZ(fxyz) desc, _ = asapxyz.get_descriptors(fmat, False) y_all = asapxyz.get_property(fy) # print(desc) dm = Design_Matrix(X=desc, y=y_all, whiten=True, test_ratio=test_ratio) # kernel, jitter, delta, sigma, sparse_mode="fps", n_sparse=None k_spec = { 'k0': { "type": "linear" } } # { 'k1': {"type": "polynomial", "d": power}} # if sigma is not set... sigma = 0.001 * np.std(y_all) krr = KRRSparse(0., None, sigma) skrr = SPARSE_KRR_Wrapper(k_spec, krr, sparse_mode="fps", n_sparse=-1) # fit the model dm.compute_fit(skrr, 'skrr', store_results=True, plot=True) # learning curve if lc_points > 1: dm.compute_learning_curve(skrr, 'ridge_regression', lc_points=lc_points, lc_repeats=lc_repeats, randomseed=42, verbose=False) dm.save_state(prefix) plt.show()
def main(): """ Test if Ridge regression is working. Parameters ---------- fxyz: string giving location of xyz file prefix: string giving the filename prefix """ fxyz = os.path.join(os.path.split(__file__)[0], 'small_molecules-SOAP.xyz') fmat = ['SOAP-n4-l3-c1.9-g0.23'] fy = 'dft_formation_energy_per_atom_in_eV' prefix = "test-rr" test_ratio = 0.05 lc_points = 8 lc_repeats = 8 # try to read the xyz file asapxyz = ASAPXYZ(fxyz) desc, _ = asapxyz.get_descriptors(fmat, False) y_all = asapxyz.get_property(fy) # print(desc) dm = Design_Matrix(X=desc, y=y_all, whiten=True, test_ratio=test_ratio) # if sigma is not set... sigma = 0.001 * np.std(y_all) rr = RidgeRegression(sigma) # fit the model dm.compute_fit(rr, 'ridge_regression', store_results=True, plot=True) # learning curve if lc_points > 1: dm.compute_learning_curve(rr, 'ridge_regression', lc_points=lc_points, lc_repeats=lc_repeats, randomseed=42, verbose=False) dm.save_state(prefix) plt.show()
def main(): """ Select frames from the supplied xyz file (fxyz) using one of the following algorithms: 1. random: random selection 2. fps: farthest point sampling selection. Need to supply a kernel matrix or descriptor matrix using -fmat 4. CUR decomposition Parameters ---------- fxyz: Path to xyz file. fmat: Path to the design matrix or name of the tags in ase xyz file prefix: Filename prefix, default is ASAP nkeep: The number of representative samples to select algorithm: 'the algorithm for selecting frames ([random], [fps], [cur])') fmat: Location of descriptor or kernel matrix file. Needed if you select [fps] or [cur]. """ fxyz = os.path.join(os.path.split(__file__)[0], 'small_molecules-SOAP.xyz') fmat = ['SOAP-n4-l3-c1.9-g0.23'] nkeep = 10 prefix = "test-frame-select" # read the xyz file asapxyz = ASAPXYZ(fxyz) # for both algo we read in the descriptor matrix desc, _ = asapxyz.get_descriptors(fmat) print("shape of the descriptor matrix: ", np.shape(desc), "number of descriptors: ", np.shape(desc[0])) for algorithm in ['random', 'cur', 'fps']: sparsifier = Sparsifier(algorithm) sbs = sparsifier.sparsify(desc, nkeep) # save selection = np.zeros(asapxyz.get_num_frames(), dtype=int) for i in sbs: selection[i] = 1 np.savetxt(prefix + "-" + algorithm + "-n-" + str(nkeep) + '.index', selection, fmt='%d') asapxyz.write(prefix + "-" + algorithm + "-n-" + str(nkeep), sbs)
def main(fmat, fxyz, fy, prefix, scale, test_ratio, sigma, lc_points, lc_repeats):
    """
    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file.
          You can use gen_descriptors.py to compute it.
    fxyz: Location of xyz file for reading the properties.
    fy: Location of property list (1D-array of floats)
    prefix: filename prefix for learning curve figure
    scale: Scale the coordinates (True/False). Scaling is highly recommended.
    test_ratio: train/test ratio
    sigma: noise level in kernel ridge regression, default is 0.1% of the standard deviation of the data.
    lc_points: number of points on the learning curve
    lc_repeats: number of sub-samplings when computing the learning curve

    Returns
    -------
    Learning curve.
    """
    scale = bool(scale)

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, _ = asapxyz.get_descriptors(fmat)
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
            print("loaded the descriptor matrix from file: ", fmat)
        except:
            raise ValueError('Cannot load the descriptor matrix from file')
    if len(desc) == 0:
        raise ValueError('Please supply descriptor in a xyz file or a standalone descriptor matrix')
    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    # read in the properties to be predicted
    y_all = []
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except:
        y_all = asapxyz.get_property(fy)

    dm = Design_Matrix(X=desc, y=y_all, whiten=True, test_ratio=test_ratio)

    # if sigma is not set...
    if sigma < 0:
        sigma = 0.001 * np.std(y_all)
    rr = RidgeRegression(sigma)

    # fit the model
    dm.compute_fit(rr, 'ridge_regression', store_results=True, plot=True)

    # learning curve
    if lc_points > 1:
        lc_scores = dm.compute_learning_curve(rr, 'ridge_regression', lc_points=lc_points,
                                              lc_repeats=lc_repeats, randomseed=42, verbose=False)
        # make plot
        lc_scores.plot_learning_curve()
    plt.show()
def main(fmat, fxyz, ftags, prefix, dimension, pc1, pc2, adtext):
    """
    Parameters
    ----------
    fmat: Location of low-dimensional coordinate file.
    ftags: Location of tags for the first M samples.
    prefix: Filename prefix.
    dimension: Number of the first X dimensions to keep
    pc1: First principal axis (int)
    pc2: Second principal axis (int)
    adtext: Boolean giving whether to adjust text or not.

    Returns
    -------
    """
    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, _ = asapxyz.get_descriptors(fmat)
    if os.path.isfile(fmat):
        try:
            desc = np.genfromtxt(fmat, dtype=float)
            print("loaded the descriptor matrix from file: ", fmat)
        except:
            raise ValueError('Cannot load the descriptor matrix from file')
    if len(desc) == 0:
        raise ValueError('Please supply descriptor in a xyz file or a standalone descriptor matrix')
    print("loaded", fmat, " with shape", np.shape(desc))

    # load tags if any
    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")
        ndict = len(tags)

    proj = np.asmatrix(desc)[:, 0:dimension]

    density_model = KDE_internal()  # KDE_sklearn(bandwidth=1) # KDE_scipy()
    # fit the density model to the data
    try:
        density_model.fit(proj)
    except:
        raise RuntimeError('KDE did not work. Try a smaller dimension.')
    rho = density_model.evaluate_density(proj)

    # save the density
    np.savetxt(prefix + "-kde.dat", np.transpose([np.arange(len(rho)), rho]),
               header='index log_of_kernel_density_estimation', fmt='%d %4.8f')

    # color scheme
    plotcolor = rho
    colorlabel = 'Log of density for every point'
    [plotcolormin, plotcolormax] = [np.min(plotcolor), np.max(plotcolor)]

    # make plot
    plot_styles.set_nice_font()
    # density plot
    fig, ax = plot_styles.plot_density_map(np.asarray(proj[:, [pc1, pc2]]), plotcolor,
                                           xlabel='Principal Axis ' + str(pc1),
                                           ylabel='Principal Axis ' + str(pc2),
                                           clabel=colorlabel, label=None,
                                           xaxis=True, yaxis=True,
                                           centers=None, psize=None,
                                           out_file=None, title='KDE for: ' + prefix,
                                           show=False, cmap='gnuplot',
                                           remove_tick=False, use_perc=False,
                                           rasterized=True, fontsize=15,
                                           vmax=plotcolormax, vmin=plotcolormin)

    fig.set_size_inches(18.5, 10.5)

    if ftags != 'none':
        texts = []
        for i in range(ndict):
            if tags[i] != 'None' and tags[i] != 'none' and tags[i] != '':
                ax.scatter(proj[i, pc1], proj[i, pc2], marker='^', c='black')
                texts.append(ax.text(proj[i, pc1], proj[i, pc2], tags[i],
                                     ha='center', va='center', fontsize=15, color='red'))
            # ax.annotate(tags[i], (proj[i,pc1], proj[i,pc2]))
        if adtext:
            from adjustText import adjust_text
            adjust_text(texts, on_basemap=True,  # only_move={'points':'', 'text':'x'},
                        expand_text=(1.01, 1.05), expand_points=(1.01, 1.05),
                        force_text=(0.03, 0.5), force_points=(0.01, 0.25),
                        ax=ax, precision=0.01,
                        arrowprops=dict(arrowstyle="-", color='black', lw=1, alpha=0.8))

    plt.show()
    fig.savefig('kde_4_' + prefix + '.png')
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, kpca_d, pc1, pc2, adtext):
    """
    Parameters
    ----------
    fmat
    fxyz
    ftags
    fcolor
    colorscol
    prefix
    output
    kpca_d: number of dimensions
    pc1
    pc2
    adtext

    Returns
    -------
    """
    foutput = prefix + "-kpca-d" + str(kpca_d)

    # load the kernel matrix
    try:
        kNN = np.genfromtxt(fmat, dtype=float)
    except:
        raise ValueError('Cannot load the kernel matrix')
    print("loaded", fmat)

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")
        if tags.ndim > 1:
            tags = tags[:, 0]
        ndict = len(tags)

    asapxyz = None
    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
    elif output == 'xyz':
        print("Did not provide the xyz file. We can only output descriptor matrix.")
        output = 'matrix'

    # main thing
    proj = KernelPCA(kpca_d).fit_transform(kNN)

    # save
    if output == 'matrix':
        np.savetxt(prefix + "-kpca-d" + str(kpca_d) + ".coord", proj, fmt='%4.8f',
                   header='low D coordinates of samples')
    elif output == 'xyz':
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'kpca_coord')
        asapxyz.write(foutput)

    # color scheme
    plotcolor, colorlabel, colorscale = set_color_function(fcolor, asapxyz, colorscol, len(proj))

    # make plot
    plot_styles.set_nice_font()
    # fig, ax = plt.subplots()
    fig, ax = plot_styles.plot_density_map(proj[:, [pc1, pc2]], plotcolor,
                                           xlabel='Principal Axis ' + str(pc1),
                                           ylabel='Principal Axis ' + str(pc2),
                                           clabel=colorlabel, label=None,
                                           xaxis=True, yaxis=True,
                                           centers=None, psize=None,
                                           out_file=None, title='KPCA for: ' + prefix,
                                           show=False, cmap='gnuplot',
                                           remove_tick=False, use_perc=True,
                                           rasterized=True, fontsize=15,
                                           vmax=colorscale[1], vmin=colorscale[0])

    fig.set_size_inches(18.5, 10.5)

    if ftags != 'none':
        texts = []
        for i in range(ndict):
            if tags[i] != 'None' and tags[i] != 'none' and tags[i] != '':
                ax.scatter(proj[i, pc1], proj[i, pc2], marker='^', c='black')
                texts.append(ax.text(proj[i, pc1], proj[i, pc2], tags[i],
                                     ha='center', va='center', fontsize=15, color='red'))
            # ax.annotate(tags[i], (proj[i,pc1], proj[i,pc2]))
        if adtext:
            from adjustText import adjust_text
            adjust_text(texts, on_basemap=True,  # only_move={'points':'', 'text':'x'},
                        expand_text=(1.01, 1.05), expand_points=(1.01, 1.05),
                        force_text=(0.03, 0.5), force_points=(0.01, 0.25),
                        ax=ax, precision=0.01,
                        arrowprops=dict(arrowstyle="-", color='black', lw=1, alpha=0.8))

    plt.show()
    fig.savefig('KPCA_4_' + prefix + '-c-' + fcolor + '.png')
parser.add_argument("--stride","-s",type=int,help="stride for asap gen_desc command; this function will be deprecated") parser.add_argument("--method","-m",type=str,default='fps',help="method, 3 options: 'random', 'cur', 'fps'") args = parser.parse_args() #dirctory = '/Users/jiedeng/Documents/tmp/jd848/project_folder/liquid_vapor/water1/r6-6k/cont1/asap' #fxyz = dirctory+'/ASAP-desc.xyz' fxyz = args.input #fxyz = os.path.join(os.path.split(__file__)[0], 'small_molecules-SOAP.xyz') # fmat = ['SOAP-n4-l3-c1.9-g0.23'] fmat = ['*'] nkeep = args.number #50 prefix = "test-frame-select" # read the xyz file asapxyz = ASAPXYZ(fxyz) # for both algo we read in the descriptor matrix desc, _ = asapxyz.get_descriptors(fmat) print("shape of the descriptor matrix: ", np.shape(desc), "number of descriptors: ", np.shape(desc[0])) algorithm = args.method#'fps' # 3 options: 'random', 'cur', 'fps' #algorithm = 'random' # 3 options: 'random', 'cur', 'fps' sparsifier = Sparsifier(algorithm) sbs = sparsifier.sparsify(desc, nkeep) sbs.sort() if args.stride is None: pass else: sbs = sbs*args.stride
def main(fxyz, dictxyz, prefix, output, peratom, fsoap_param, soap_rcut, soap_g, soap_n, soap_l,
         zeta_list, kernel_type, element_wise, soap_periodic, stride):
    """
    Generate the SOAP descriptors.

    Parameters
    ----------
    fxyz: string giving location of xyz file
    prefix: string giving the filename prefix
    output: [xyz]: append the SOAP descriptors to the extended xyz file; [mat]: output as a standalone matrix
    fsoap_param: use (possibly multiple sets of) SOAP descriptors with parameters specified in the
                 fsoap_param file (json format)
    soap_rcut: float giving the cutoff radius, default value is 3.0
    soap_g: float giving the atom width
    soap_n: int giving the maximum radial label
    soap_l: int giving the maximum angular label. Must be less than or equal to 9
    zeta_list: get the global descriptor from the atomic ones raised to the zeta-th power
    kernel_type: type of operation used to get global descriptors from the atomic soap vectors
    element_wise: consider different species separately when computing global descriptors from the
                  atomic soap vectors
    soap_periodic: string (True or False) indicating whether the system is periodic
    stride: compute the descriptor every X frames
    """
    # read frames
    asapxyz = ASAPXYZ(fxyz)

    if fsoap_param is not None:
        import json
        # load the parameters from the json file
        if os.path.isfile(fsoap_param):
            try:
                with open(fsoap_param, 'r') as soapfile:
                    soap_js = json.load(soapfile)
            except:
                raise IOError('Cannot load the json file for soap parameters')
        # use the default parameters
        else:
            soap_js = universal_soap_hyper(global_species, fsoap_param, dump=True)

        # make descriptors
        soap_desc_atomic = []
        for element in soap_js.keys():
            soap_param = soap_js[element]
            [species_now, cutoff_now, g_now, n_now, l_now] = [soap_param['species'], soap_param['cutoff'],
                                                              soap_param['atom_gaussian_width'],
                                                              soap_param['n'], soap_param['l']]
            soap_desc_atomic.append(SOAP(species=species_now, rcut=cutoff_now, nmax=n_now, lmax=l_now,
                                         sigma=g_now, rbf="gto", crossover=False, average=False,
                                         periodic=soap_periodic))

        foutput = prefix + "-soapparam" + '-' + fsoap_param
        desc_name = "SOAPPARAM" + '-' + fsoap_param
    else:
        soap_desc_atomic = [SOAP(species=global_species, rcut=soap_rcut, nmax=soap_n, lmax=soap_l,
                                 sigma=soap_g, rbf="gto", crossover=False, average=False,
                                 periodic=soap_periodic)]

        foutput = prefix + "-n" + str(soap_n) + "-l" + str(soap_l) + "-c" + str(soap_rcut) + "-g" + str(soap_g)
        desc_name = "SOAP" + "-n" + str(soap_n) + "-l" + str(soap_l) + "-c" + str(soap_rcut) + "-g" + str(soap_g)

    # NOTE: as shown, this loop relies on `frames` (the list of ASE frames) and, further below,
    # `nframes` being defined in the surrounding script; they are not set inside this function.
    for i, frame in enumerate(frames):
        fnow = soap_desc_atomic[0].create(frame, n_jobs=8)
        for soap_desc_atomic_now in soap_desc_atomic[1:]:
            fnow = np.append(fnow, soap_desc_atomic_now.create(frame, n_jobs=8), axis=1)

        if kernel_type == 'average' and element_wise == False and len(zeta_list) == 1 and zeta_list[0] == 1:
            # this is the vanilla situation. We just take the average soap for all atoms
            frame.info[desc_name] = Atomic_2_Global_Descriptor_By_Species(fnow, [], [], kernel_type, zeta_list)
        elif element_wise == False:
            frame.info[desc_name + '-' + kernel_type] = Atomic_2_Global_Descriptor_By_Species(
                fnow, [], [], kernel_type, zeta_list)
        else:
            frame.info[desc_name + '-' + kernel_type + '-elementwise'] = Atomic_2_Global_Descriptor_By_Species(
                fnow, frame.get_atomic_numbers(), global_species, kernel_type, zeta_list)

    # save
    if output == 'matrix':
        asapxyz.write_descriptor_matrix(desc_name, desc_name)
        if peratom or nframes == 1:
            asapxyz.write_atomic_descriptor_matrix(desc_name, desc_name)
    elif output == 'xyz':
        asapxyz.write(foutput)
    else:
        raise ValueError('Cannot find the output format')
def load_asapxyz(data_spec):
    from asaplib.data import ASAPXYZ
    return ASAPXYZ(data_spec['fxyz'], data_spec['stride'], data_spec['periodic'], data_spec['fxyz_format'])
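# A minimal sketch (illustrative assumption) of the data_spec dictionary consumed by
# load_asapxyz above. The keys mirror the positional arguments; the file name and the
# values shown are examples only.
example_data_spec = {
    'fxyz': 'small_molecules-SOAP.xyz',  # path to the (extended) xyz file
    'stride': 1,                         # read every frame
    'periodic': False,                   # treat the structures as non-periodic
    'fxyz_format': None,                 # file format hint passed through to ASAPXYZ
}
# asapxyz = load_asapxyz(example_data_spec)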
def main(fmat, fxyz, fy, prefix, test_ratio, jitter, n_sparse, sigma, lc_points, lc_repeats):
    """
    Parameters
    ----------
    fmat: Location of kernel matrix file.
    fy: Location of property list (1D-array of floats)
    prefix: filename prefix for learning curve figure
    test_ratio: train/test ratio
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples, default is 5% of the data
    sigma: noise level in kernel ridge regression, default is 0.1% of the standard deviation of the data.
    lc_points: number of points on the learning curve
    lc_repeats: number of sub-samplings when computing the learning curve

    Returns
    -------
    Fitting outcome & Learning curve.
    """
    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError:
        raise Exception('fmat file could not be loaded. Please check the filename')
    print("loaded", fmat)

    # read in the properties to be predicted
    y_all = []
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except:
        try:
            # try to read the xyz file
            if fxyz != 'none':
                asapxyz = ASAPXYZ(fxyz)
                y_all = asapxyz.get_property(fy)
        except OSError:
            raise Exception('property vector file could not be loaded. Please check the filename')

    if len(y_all) != len(K_all):
        raise ValueError('Length of the vector of properties is not the same as number of samples')
    else:
        n_sample = len(K_all)

    # train test split
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = n_train // 20
    # sparsification
    if n_sparse >= n_train:
        print("the number of representative structures is too large, please select n < ", n_train)
    elif n_sparse > 0:
        ifps, dfps = fps(K_train, n_sparse, 0)
        K_MM = K_train[:, ifps][ifps]
        K_NM = K_train[:, ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    # if sigma is not set...
    if sigma < 0:
        sigma = 0.001 * np.std(y_train)
    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    fit_error = {}
    # get the predictions for the train set
    y_pred = krr.predict(K_NM)
    # compute the CV score for the dataset
    y_pred, y_pred_test, fit_error = krr.get_train_test_error(K_NM, y_train, K_TM, y_test,
                                                              verbose=True, return_pred=True)
    # dump to file
    import json
    with open('KRR_train_test_errors_4' + prefix + '.json', 'w') as fp:
        json.dump(fit_error, fp)

    # learning curve
    # decide train sizes
    if lc_points > 1 and n_sparse > 0:
        train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
        print("Learning curves using train sizes: ", train_sizes)
        lc_stats = lc_repeats * np.ones(lc_points, dtype=int)
        lc = LCSplit(ShuffleSplit, n_repeats=lc_stats, train_sizes=train_sizes,
                     test_size=n_test, random_state=10)

        lc_scores = LC_SCOREBOARD(train_sizes)
        for lctrain, _ in lc.split(y_train):
            Ntrain = len(lctrain)
            lc_K_NM = K_NM[lctrain, :]
            lc_y_train = y_train[lctrain]
            # here we always use the same test set
            # otherwise, one can do `lc_K_test = K_NM[lctest,:]; lc_y_test = y_train[lctest]`
            krr.fit(K_MM, lc_K_NM, lc_y_train)
            # here we always use the same test set
            _, lc_score_now = krr.fit_predict_error(K_MM, lc_K_NM, lc_y_train, K_TM, y_test)
            lc_scores.add_score(Ntrain, lc_score_now)

        sc_name = 'RMSE'  # MAE, RMSE, SUP, R2, CORR
        lc_results = lc_scores.fetch(sc_name)
        # output learning curve
        np.savetxt("KRR_learning_curve_4" + prefix + ".dat", lc_results)

    plot_styles.set_nice_font()
    if lc_points > 1 and n_sparse > 0:
        fig = plt.figure(figsize=(8 * 2.1, 8))
        ax = fig.add_subplot(121)
    else:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    if lc_points > 1 and n_sparse > 0:
        ax2 = fig.add_subplot(122)
        ax2.errorbar(lc_results[:, 0], lc_results[:, 1], yerr=lc_results[:, 2],
                     linestyle='', uplims=True, lolims=True)
        ax2.set_title('Learning curve')
        ax2.set_xlabel('Number of training samples')
        ax2.set_ylabel('Test {}'.format(sc_name))
        ax2.set_xscale('log')
        ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')
r1_tag = np.ones(100).astype(int)
r3_tag = (np.ones(250) * 0).astype(int)
ppv_tag = (np.ones(75) * 2).astype(int)
r1_r3_tag = np.concatenate((r1_tag, r3_tag))
r1_r3_ppv_tag = np.concatenate((r1_tag, r3_tag, ppv_tag))
np.savetxt('tag', r1_r3_tag, fmt='%d')
np.savetxt('tag_ppv', r1_r3_ppv_tag, fmt='%d')

# fmat = 'pca_coord'
fmat = 'skpca-d-10'
# fmat = '[*]'
asapxyz = ASAPXYZ(fxyz)
dm, _ = asapxyz.get_descriptors(fmat, False)
dm_mg = asapxyz.get_atomic_descriptors(fmat, 12)
dm_oxygen = asapxyz.get_atomic_descriptors(fmat, 8)
dm_silicon = asapxyz.get_atomic_descriptors(fmat, 14)

plotcolor_volume, _, _, _ = set_color_function('volume', asapxyz)
plotcolor_density = np.zeros(len(plotcolor_volume))
for i in range(len(plotcolor_volume)):
    plotcolor_density[i] = 29.889703 / plotcolor_volume[i] / 3.

# tags = np.loadtxt('ice-54-labels.dat', dtype="str")[:,0]
# iceornot_hydrogen, _, _, _ = set_color_function('ice-or-not.tag', asapxyz, 0, 0, False, True, 1, False)
iceornot_oxygen, _, _, _ = set_color_function('tag', asapxyz, 0, 0, False,
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom, keepraw, scale, pca_d, pc1, pc2,
         projectatomic, plotatomic, adtext):
    """
    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file.
          You can use gen_descriptors.py to compute it.
    fxyz: Location of xyz file for reading the properties.
    ftags: Location of tags for the first M samples. Plot the tags on the PCA map.
    fcolor: Location of a file or name of the tags in ase xyz file. It should contain properties for
            all samples (N floats) used to color the scatter plot.
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix, default is ASAP
    output: The format for output files ([xyz], [matrix]). Default is xyz.
    peratom: Whether to output per-atom pca coordinates (True/False)
    keepraw: Whether to keep the high dimensional descriptor when output is an xyz file (True/False)
    scale: Scale the coordinates (True/False). Scaling is highly recommended.
    pca_d: Number of the principal components to keep
    pc1: Plot the projection along which principal axis
    pc2: Plot the projection along which principal axis
    projectatomic: build the projection using the (big) atomic descriptor matrix
    plotatomic: Plot the PCA coordinates of all atomic environments (True/False)
    adtext: Whether to adjust the texts (True/False)

    Returns
    -------
    """
    foutput = prefix + "-pca-d" + str(pca_d)
    use_atomic_desc = (peratom or plotatomic or projectatomic)

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print("Did not provide the xyz file. We can only output descriptor matrix.")
        output = 'matrix'
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
            print("loaded the descriptor matrix from file: ", fmat)
        except:
            raise ValueError('Cannot load the descriptor matrix from file')
    # sanity check
    if len(desc) == 0:
        raise ValueError('Please supply descriptor in a xyz file or a standalone descriptor matrix')
    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
        ndict = len(tags)
    else:
        tags = []

    reduce_dict = {
        "pca": {"type": 'PCA', 'parameter': {"n_components": pca_d, "scalecenter": scale}}
    }
    """
    # alternative specifications:
    reduce_dict = {
        "umap": {"type": 'UMAP', 'parameter': {"n_components": pca_d, "n_neighbors": 10}}
    }
    reduce_dict = {
        "reduce1_pca": {"type": 'PCA', 'parameter': {"n_components": 20, "scalecenter": True}},
        "reduce2_tsne": {"type": 'TSNE', 'parameter': {"n_components": 2, "perplexity": 20}}
    }
    """
    dreducer = Dimension_Reducers(reduce_dict)
    proj = dreducer.fit_transform(desc)
    if peratom or plotatomic and not projectatomic:
        proj_atomic_all = dreducer.transform(desc_atomic)

    # save
    if output == 'matrix':
        np.savetxt(foutput + ".coord", proj, fmt='%4.8f', header='low D coordinates of samples')
        if peratom:
            np.savetxt(foutput + "-atomic.coord", proj_atomic_all, fmt='%4.8f',
                       header='low D coordinates of samples')
    if output == 'xyz':
        asapxyz.set_descriptors(proj, 'pca_coord')
        if peratom:
            asapxyz.set_atomic_descriptors(proj_atomic_all, 'pca_coord')
        # remove the raw descriptors
        if not keepraw:
            asapxyz.remove_descriptors(fmat)
            asapxyz.remove_atomic_descriptors(fmat)
        asapxyz.write(foutput)

    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(
        fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)

    if plotatomic:
        outfile = 'PCA_4_' + prefix + '-c-' + fcolor + '-plotatomic.png'
    else:
        outfile = 'PCA_4_' + prefix + '-c-' + fcolor + '.png'

    fig_spec_dict = {
        'outfile': outfile,
        'show': False,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True, 'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {"type": 'scatter', 'clabel': colorlabel},
            "second_p": {"type": 'annotate', 'adtext': adtext}
        }
    }
    asap_plot = Plotters(fig_spec_dict)
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], [], tags)
    if peratom or plotatomic and not projectatomic:
        asap_plot.plot(proj_atomic_all[::-1, [pc1, pc2]], plotcolor_peratom[::-1], [], [])
    plt.show()
# make tags
if ele == 14 or ele == 12:
    n_atoms = 32
n_frames = get_nframes(fxyz_recal_om8, 'xyz')
tag0 = sum(np.array(n_frames).astype(int)) * n_atoms * [0]
n_frames = get_nframes(fxyz_recal_pv, 'xyz')
tag1 = sum(np.array(n_frames).astype(int)) * n_atoms * [1]
tags = tag0 + tag1
# tag0 = (n_atoms*9 + n_atoms*250)*[0] + n_atoms*100*[1]

###########
########### The following is equivalent to the above
###########
asapxyz = ASAPXYZ(fxyz)

reduce_dict = {}
reduce_dict["preprocessing"] = {"type": 'SCALE', 'parameter': None}
reduce_dict['skpca'] = {
    "type": 'SPARSE_KPCA',
    'parameter': {
        "n_components": 3,
        "n_sparse": -1,  # no sparsification
        # "scale": True,
        "kernel": {
            "first_kernel": {
                "type": 'linear'
            }
        }
    }
}
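# A brief sketch (assumption, mirroring the dimensionality-reduction test further below) of
# how a reduce_dict like the one above is typically consumed: it is passed to
# Dimension_Reducers, which fits on and projects a design matrix such as `dm` defined earlier.
# dreducer = Dimension_Reducers(reduce_dict)
# proj = dreducer.fit_transform(dm)      # dm: (n_samples, n_features) design matrix
# proj_mg = dreducer.transform(dm_mg)    # project the per-species atomic descriptors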
def main(fxyz, prefix):
    """
    Test if computing descriptors is working.

    Parameters
    ----------
    fxyz: string giving location of xyz file
    prefix: string giving the filename prefix
    """
    # read frames
    asapxyz = ASAPXYZ(fxyz, 1, False)  # not periodic

    peratom = True
    tag = 'test'

    soap_js = {
        'soap1': {
            'type': 'SOAP',
            'cutoff': 2.0,
            'n': 2,
            'l': 2,
            'atom_gaussian_width': 0.2,
            'rbf': 'gto',
            'crossover': False
        }
    }
    acsf_js = {
        'acsf1': {
            'type': 'ACSF',
            'cutoff': 2.0,
            'g2_params': [[1, 1], [1, 2], [1, 3]],
            'g4_params': [[1, 1, 1], [1, 2, 1], [1, 1, -1], [1, 2, -1]]
        }
    }
    k2_js = {
        'lmbtr-k2': {
            'type': 'LMBTR_K2',
            'k2': {
                "geometry": {"function": "distance"},
                "grid": {"min": 0, "max": 2, "n": 10, "sigma": 0.1},
                "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-3}
            },
            'periodic': False,
            'normalization': "l2_each"
        }
    }

    kernel_js = {}
    kernel_js['k1'] = {
        'reducer_type': 'moment_average',
        'zeta': 2,
        'element_wise': False
    }
    kernel_js['k2'] = {'reducer_type': 'sum', 'element_wise': True}

    desc_spec_js = {
        'test_cm': {'type': "CM"},
        'test_soap': {'atomic_descriptor': soap_js, 'reducer_function': kernel_js},
        'test_acsf': {'atomic_descriptor': acsf_js, 'reducer_function': kernel_js},
        'test_k2': {'atomic_descriptor': k2_js, 'reducer_function': kernel_js}
    }

    # compute the descriptors
    asapxyz.compute_global_descriptors(desc_spec_js, [], peratom, tag)
    asapxyz.write_computed_descriptors(prefix, ['test_cm', 'test_soap'], [0])
    asapxyz.write(prefix)
    asapxyz.save_state(tag)
def main(fmat, kmat, fxyz, ftags, prefix, fcolor, colorscol, dimension, pc1, pc2, algorithm,
         projectatomic, adtext):
    """
    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file.
          You can use gen_descriptors.py to compute it.
    kmat: Location of the kernel matrix.
    fxyz: Location of xyz file for reading the properties.
    ftags: Location of tags for the first M samples
    prefix: Filename prefix. Default is ASAP.
    fcolor: Properties for all samples (N floats) used to color the scatter plot, [filename/rho/cluster]
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    dimension: The number of principal components to keep
    pc1: int, default is 0, which principal axis to plot the projection on
    pc2: int, default is 1, which principal axis to plot the projection on
    algorithm: the algorithm for density-based clustering. Options are: ([dbscan], [fdb])
    projectatomic: build the projection using the (big) atomic descriptor matrix
    adtext: Whether to adjust the text (True/False)

    Returns
    -------
    cluster labels, PCA plots
    """
    if fmat == 'none' and kmat == 'none':
        raise ValueError('Must provide either the low-dimensional coordinates fmat or the kernel matrix kmat')

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, projectatomic)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print("Did not provide the xyz file. We can only output descriptor matrix.")
        output = 'matrix'
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
            print("loaded the descriptor matrix from file: ", fmat)
        except:
            raise ValueError('Cannot load the descriptor matrix from file')

    if kmat != 'none':
        try:
            kNN = np.genfromtxt(kmat, dtype=float)
            print("loaded kernel matrix", kmat, "with shape", np.shape(kNN))
            desc = kerneltodis(kNN)
        except:
            raise ValueError('Cannot load the coordinates')

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")
        ndict = len(tags)

    # now we do the clustering
    if algorithm == 'dbscan':
        # we compute the characteristic bandwidth of the data
        # first select a subset of structures (50)
        sbs = np.random.choice(np.asarray(range(len(desc))), 50, replace=False)
        # the characteristic bandwidth of the data
        sigma_kij = np.percentile(cdist(desc[sbs], desc, 'euclidean'), 100 * 10. / len(desc))
        trainer = sklearn_DB(sigma_kij, 5, 'euclidean')  # adjust the parameters here!
        do_clustering = DBCluster(trainer)
        do_clustering.fit(desc)
    elif algorithm == 'fdb' or algorithm == 'FDB':
        trainer = LAIO_DB()
        do_clustering = DBCluster(trainer)
        do_clustering.fit(desc)
    else:
        raise ValueError('Please select from fdb or dbscan')

    print(do_clustering.pack())
    # with open("clustering_results_4_" + prefix + ".json", 'w') as fp:
    #     json.dump(do_clustering.pack(), fp, cls=NpEncoder)

    labels_db = do_clustering.get_cluster_labels()
    n_clusters = do_clustering.get_n_cluster()

    if asapxyz is not None and projectatomic:
        asapxyz.set_atomic_descriptors(labels_db, 'cluster_label')
    elif asapxyz is not None:
        asapxyz.set_descriptors(labels_db, 'cluster_label')
    # save
    np.savetxt(prefix + "-cluster-label.dat", np.transpose([np.arange(len(labels_db)), labels_db]),
               header='index cluster_label', fmt='%d %d')

    if fmat != 'none':
        pca = PCA(dimension, True)
        proj = pca.fit_transform(desc)
    elif kmat != 'none':
        proj = KernelPCA(dimension).fit_transform(kNN)

    # color scheme
    if fcolor == 'cluster_label':
        plotcolor = labels_db
        colorlabel = 'cluster_label'
    else:
        if projectatomic:
            _, plotcolor, colorlabel, colorscale = set_color_function(fcolor, asapxyz, colorscol, 0, True)
        else:
            plotcolor, colorlabel, colorscale = set_color_function(fcolor, asapxyz, colorscol, len(proj), False)

    print(labels_db[::-1])
    outfile = 'Clustering_4_' + prefix + '.png'
    # make plot
    fig_spec_dict = {
        'outfile': outfile,
        'show': True,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True, 'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {"type": 'scatter', 'clabel': colorlabel},
            # "second_p": {"type": 'annotate', 'adtext': adtext},
            "third_p": {"type": 'cluster', 'w_label': True, 'circle_size': 20}
        }
    }
    asap_plot = Plotters(fig_spec_dict)
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], labels_db[::-1], [])  # tags
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom, keepraw, scale, tsne_d, dim1, dim2,
         perplexity, projectatomic, plotatomic, adtext):
    """
    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file.
          You can use gen_descriptors.py to compute it.
    fxyz: Location of xyz file for reading the properties.
    ftags: Location of tags for the first M samples. Plot the tags on the t-SNE map.
    fcolor: Location of a file or name of the tags in ase xyz file. It should contain properties for
            all samples (N floats) used to color the scatter plot.
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix, default is ASAP
    output: The format for output files ([xyz], [matrix]). Default is xyz.
    peratom: Whether to output per-atom t-SNE coordinates (True/False)
    keepraw: Whether to keep the high dimensional descriptor when output is an xyz file (True/False)
    scale: Scale the coordinates (True/False). Scaling is highly recommended.
    tsne_d: Dimension of the embedded space.
    dim1: Plot the projection along which axis
    dim2: Plot the projection along which axis
    perplexity: Perplexity setting for t-SNE. Typical values are between 5 and 50.
    projectatomic: build the projection using the (big) atomic descriptor matrix
    plotatomic: Plot the t-SNE coordinates of all atomic environments (True/False)
    adtext: Whether to adjust the texts (True/False)

    Returns
    -------
    """
    foutput = prefix + "-pca-d" + str(tsne_d)
    use_atomic_desc = (peratom or plotatomic or projectatomic)

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print("Did not provide the xyz file. We can only output descriptor matrix.")
        output = 'matrix'
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
            print("loaded the descriptor matrix from file: ", fmat)
        except:
            raise ValueError('Cannot load the descriptor matrix from file')
    # sanity check
    if len(desc) == 0:
        raise ValueError('Please supply descriptor in a xyz file or a standalone descriptor matrix')
    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
        ndict = len(tags)
    else:
        tags = []

    # scale & center
    if scale:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        print('Shape of descriptor matrix is {}'.format(desc.shape))
        print(scaler.fit(desc))
        desc = scaler.transform(desc)  # normalizing the features

    # fit t-SNE
    if desc.shape[1] >= 50:
        # pre-process with PCA if dim > 50
        # suggested here: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
        pca = PCA(n_components=50)
        desc = pca.fit_transform(desc)
        print('Shape of processed descriptor matrix after applying PCA is {}'.format(desc.shape))
    tsne = TSNE(n_components=tsne_d, perplexity=perplexity)
    proj = tsne.fit_transform(desc)
    if peratom or plotatomic and not projectatomic:
        raise NotImplementedError
        # proj_atomic_all = tsne.transform(desc_atomic)

    # save
    if output == 'matrix':
        np.savetxt(foutput + ".coord", proj, fmt='%4.8f', header='low D coordinates of samples')
    if output == 'xyz':
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'pca_coord')
        if peratom:
            asapxyz.set_atomic_descriptors(proj_atomic_all, 'pca_coord')
        # remove the raw descriptors
        if not keepraw:
            asapxyz.remove_descriptors(fmat)
            asapxyz.remove_atomic_descriptors(fmat)
        asapxyz.write(foutput)

    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(
        fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)

    # make plot
    plot_styles.set_nice_font()
    fig, ax = plt.subplots()
    if plotatomic and not projectatomic:
        # notice that we reverse the list of coordinates, in order to make the structures in the dictionary more obvious
        fig, ax = plot_styles.plot_density_map(proj_atomic_all[::-1, [dim1, dim2]], plotcolor_peratom[::-1],
                                               fig, ax,
                                               xlabel='Principal Axis ' + str(dim1),
                                               ylabel='Principal Axis ' + str(dim2),
                                               clabel=None, label=None,
                                               xaxis=True, yaxis=True,
                                               centers=None, psize=None,
                                               out_file=None, title=None,
                                               show=False, cmap='gnuplot',
                                               remove_tick=False, use_perc=False,
                                               rasterized=True, fontsize=15,
                                               vmax=colorscale[1], vmin=colorscale[0])

    fig, ax = plot_styles.plot_density_map(proj[::-1, [dim1, dim2]], plotcolor[::-1],
                                           fig, ax,
                                           xlabel='Principal Axis ' + str(dim1),
                                           ylabel='Principal Axis ' + str(dim2),
                                           clabel=colorlabel, label=None,
                                           xaxis=True, yaxis=True,
                                           centers=None, psize=None,
                                           out_file=None, title='t-SNE for: ' + prefix,
                                           show=False, cmap='gnuplot',
                                           remove_tick=False, use_perc=False,
                                           rasterized=True, fontsize=15,
                                           vmax=colorscale[1], vmin=colorscale[0])

    fig.set_size_inches(160.5, 80.5)

    if ftags != 'none':
        texts = []
        for i in range(ndict):
            if tags[i] != 'None' and tags[i] != 'none' and tags[i] != '':
                ax.scatter(proj[i, dim1], proj[i, dim2], marker='^', c='black')
                texts.append(ax.text(proj[i, dim1], proj[i, dim2], tags[i],
                                     ha='center', va='center', fontsize=10, color='red'))
        if adtext:
            from adjustText import adjust_text
            adjust_text(texts, on_basemap=True,  # only_move={'points':'', 'text':'x'},
                        expand_text=(1.01, 1.05), expand_points=(1.01, 1.05),
                        force_text=(0.03, 0.5), force_points=(0.01, 0.25),
                        ax=ax, precision=0.01,
                        arrowprops=dict(arrowstyle="-", color='black', lw=1, alpha=0.8))

    plt.show()
    if plotatomic:
        fig.savefig('t-SNE_4_' + prefix + '-c-' + fcolor + '-plotatomic.png')
    else:
        fig.savefig('t-SNE_4_' + prefix + '-c-' + fcolor + '.png')
def main(fxyz, fy, prefix, nkeep, algorithm, fmat, fkde, reweight_lambda):
    r"""
    Select frames from the supplied xyz file (fxyz) using one of the following algorithms:

    1. random: random selection
    2. fps: farthest point sampling selection. Need to supply a kernel matrix or descriptor matrix using -fmat
    3. sortmin/sortmax: select the frames with the largest/smallest value. Need to supply the vector of
       properties using -fy
    4. cur: CUR decomposition
    5. reweight: select according to the re-weighted distribution exp(-f/\lambda),
       where exp(-f) is the precomputed kernel density estimation of the original samples.

    Parameters
    ----------
    fxyz: Path to xyz file.
    fy: Path to the list of properties (N floats) or name of the tags in ase xyz file
    prefix: Filename prefix, default is ASAP
    nkeep: The number of representative samples to select
    algorithm: The algorithm for selecting frames ([random], [fps], [sort], [reweight])
    fmat: Location of descriptor or kernel matrix file. Needed if you select [fps].
          You can use gen_kmat.py to compute it.
    reweight_lambda: select samples according to the re-weighted distribution exp(-f/\lambda),
          where exp(-f) is the kernel density estimation of the original samples.
    """
    # read the xyz file
    asapxyz = ASAPXYZ(fxyz)
    nframes = asapxyz.get_num_frames()

    if nkeep == 0:
        nkeep = nframes

    if fy != 'none':
        y_all = []
        try:
            y_all = np.genfromtxt(fy, dtype=float)
        except:
            y_all = asapxyz.get_property(fy)
        if len(y_all) != nframes:
            raise ValueError('Length of the vector of properties is not the same as number of samples')

    if algorithm == 'random' or algorithm == 'RANDOM':
        idx = np.asarray(range(nframes))
        sbs = np.random.choice(idx, nkeep, replace=False)

    elif algorithm == 'sortmax' or algorithm == 'sortmin':
        if fy == 'none':
            raise ValueError('must supply the vector of properties for sorting')
        idx = np.asarray(range(nframes))
        if algorithm == 'sortmax':
            sbs = [x for _, x in sorted(zip(y_all, idx))][:nkeep]
        elif algorithm == 'sortmin':
            sbs = [x for _, x in sorted(zip(y_all, idx))][nkeep:]

    elif algorithm == 'fps' or algorithm == 'FPS' or algorithm == 'cur' or algorithm == 'CUR':
        # for both algorithms we read in the descriptor matrix
        desc, _ = asapxyz.get_descriptors(fmat)
        if os.path.isfile(fmat):
            try:
                desc = np.genfromtxt(fmat, dtype=float)
            except:
                raise ValueError('Cannot load the kernel matrix')
        print("shape of the descriptor matrix: ", np.shape(desc),
              "number of descriptors: ", np.shape(desc[0]))
        # FPS
        if algorithm == 'fps' or algorithm == 'FPS':
            sbs, dmax_remain = fps(desc, nkeep, 0)
            print("Making farthest point sampling selection")
            np.savetxt(prefix + "-" + algorithm + "-n-" + str(nkeep) + '.error', dmax_remain, fmt='%4.8f',
                       header='the maximum remaining distance in FPS')
        # CUR decomposition
        if algorithm == 'cur' or algorithm == 'CUR':
            desc = np.asmatrix(desc)
            cov = np.dot(desc, desc.T)
            print("Making CUR selection")
            print("shape of the covariance matrix:", np.shape(cov))
            sbs, rcov_error = CUR_deterministic(cov, nkeep)
            np.savetxt(prefix + "-" + algorithm + "-n-" + str(nkeep) + '.error', rcov_error, fmt='%4.8f',
                       header='the remaining error of the covariance matrix')

    elif algorithm == 'reweight':
        if os.path.isfile(fkde):
            try:
                logkde = np.genfromtxt(fkde, dtype=float)[:, 1]
            except:
                raise IOError('Cannot load the (log of) kernel density for each sample')
            if len(logkde) != nframes:
                raise ValueError('mismatch of number of frames and kernel densities')
        else:
            raise ValueError('must supply the (log of) kernel density for each sample')

        new_kde = np.zeros(nframes)
        for i in range(nframes):
            new_kde[i] = np.exp(logkde[i] / reweight_lambda) / np.exp(logkde[i])
        # compute the normalization factor so we expect to select n samples in the end
        normalization = nkeep / np.sum(new_kde)
        new_kde *= normalization
        sbs = []
        randomchoice = np.random.rand(nframes)
        for i in range(nframes):
            if randomchoice[i] < new_kde[i]:
                sbs.append(i)
        algorithm = algorithm + "-lambda-" + str(reweight_lambda)

    # save
    selection = np.zeros(nframes, dtype=int)
    for i in sbs:
        selection[i] = 1
    np.savetxt(prefix + "-" + algorithm + "-n-" + str(nkeep) + '.index', selection, fmt='%d')
    if fy != 'none':
        np.savetxt(prefix + "-" + algorithm + "-n-" + str(nkeep) + '-' + fy, np.asarray(y_all)[sbs], fmt='%4.8f')
    asapxyz.write(prefix + "-" + algorithm + "-n-" + str(nkeep), sbs)
def main(): """ Test if dimensionality reduction is working. Parameters ---------- fxyz: string giving location of xyz file prefix: string giving the filename prefix """ fxyz = os.path.join(os.path.split(__file__)[0], 'small_molecules-SOAP.xyz') fmat = ['SOAP-n4-l3-c1.9-g0.23'] fcolor = 'dft_formation_energy_per_atom_in_eV' pca_d = 10 prefix = "test-dimensionality-reduction" foutput = prefix + "-pca-d" + str(pca_d) # try to read the xyz file asapxyz = ASAPXYZ(fxyz) desc, _ = asapxyz.get_descriptors(fmat, False) print(desc) """ reduce_dict = { "pca": {"type": 'PCA', 'parameter':{"n_components": pca_d, "scalecenter": scale}} } reduce_dict = { "preprocessing": {"type": 'SCALE', 'parameter': None}, "umap": {"type": 'UMAP', 'parameter':{"n_components": pca_d, "n_neighbors": 10}} } reduce_dict = { "reduce1_pca": {"type": 'PCA', 'parameter':{"n_components": 20, "scalecenter":True}}, "reduce2_tsne": {"type": 'TSNE', 'parameter': {"n_components": 2, "perplexity":20}} } """ reduce_dict = { "preprocessing": {"type": 'SCALE', 'parameter': None}, "skpca": {"type": 'SPARSE_KPCA', 'parameter':{"n_components": pca_d, "kernel": {"first_kernel": {"type": 'linear', "normalize": True}} } } } dreducer = Dimension_Reducers(reduce_dict) proj = dreducer.fit_transform(desc) # save asapxyz.set_descriptors(proj, 'pca_coord') asapxyz.write(foutput) # color scheme plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(fcolor, asapxyz) outfile = 'PCA_4_' + prefix + '-c-' + fcolor + '.png' fig_spec_dict = { 'outfile': outfile, 'show': False, 'title': None, 'xlabel': 'Principal Axis 1', 'ylabel': 'Principal Axis 2', 'xaxis': True, 'yaxis': True, 'remove_tick': False, 'rasterized': True, 'fontsize': 16, 'components':{ "first_p": {"type": 'scatter', 'clabel': colorlabel}, "second_p": {"type": 'annotate', 'adtext': False} } } asap_plot = Plotters(fig_spec_dict) asap_plot.plot(proj[::-1, [0, 1]], plotcolor[::-1], [], []) plt.show()
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom, keepraw, sparse_mode, n_sparse,
         power, kpca_d, pc1, pc2, projectatomic, plotatomic, adjusttext):
    """
    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file.
          You can use gen_descriptors.py to compute it.
    fxyz: Location of xyz file for reading the properties.
    ftags: Location of tags for the first M samples. Plot the tags on the (k)PCA map.
    fcolor: Location of a file or name of the tags in ase xyz file. It should contain properties for
            all samples (N floats) used to color the scatter plot.
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix, default is ASAP
    output: The format for output files ([xyz], [matrix]). Default is xyz.
    peratom: Whether to output per-atom pca coordinates (True/False)
    keepraw: Whether to keep the high dimensional descriptor when output is an xyz file (True/False)
    n_sparse: number of representative samples, default is 5% of the data
    power: use a polynomial kernel function of degree n.
    kpca_d: Number of the principal components to keep
    pc1: Plot the projection along which principal axis
    pc2: Plot the projection along which principal axis
    projectatomic: build the projection using the (big) atomic descriptor matrix
    plotatomic: Plot the KPCA coordinates of all atomic environments (True/False)
    adjusttext: Whether to adjust the texts (True/False)

    Returns
    -------
    """
    foutput = prefix + "-kpca-d" + str(kpca_d)  # file name stem; needed by the 'xyz' output branch below
    use_atomic_desc = (peratom or plotatomic or projectatomic)

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print("Did not provide the xyz file. We can only output descriptor matrix.")
        output = 'matrix'
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
            print("loaded the descriptor matrix from file: ", fmat)
        except:
            raise ValueError('Cannot load the descriptor matrix from file')
    # sanity check
    if len(desc) == 0:
        raise ValueError('Please supply descriptor in a xyz file or a standalone descriptor matrix')
    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
        ndict = len(tags)
    else:
        tags = []

    # sparsification
    n_sample = len(desc)
    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = max(10, n_sample // 20)
    # sparsification
    if n_sparse >= n_sample:
        print("the number of representative structures is too large, please select n < ", n_sample)
    elif n_sample > 0:
        if sparse_mode == 'fps' or sparse_mode == 'FPS':
            ifps, _ = fps(desc, n_sparse, 0)
        elif sparse_mode == 'cur' or sparse_mode == 'CUR':
            cov = np.dot(np.asmatrix(desc), np.asmatrix(desc).T)
            ifps, _ = CUR_deterministic(cov, n_sparse)
        else:
            raise ValueError('Cannot find the specified sparsification mode')
    else:
        print("Not using any sparsification")
        ifps = np.arange(n_sample)

    k_spec = {'k0': {"type": "cosine"}}  # {'k1': {"type": "polynomial", "d": power}}
    k_transform = Descriptors_to_Kernels(k_spec)

    kNN = k_transform.compute(desc[ifps])
    kMN = k_transform.compute(desc, desc[ifps])
    print("Shape of the kNN matrix: ", np.shape(kNN), ", and shape of the kMN matrix:", np.shape(kMN))

    # main thing
    kpca = KernelPCA(kpca_d)
    kpca.fit(kNN)
    proj = kpca.transform(kMN)
    if peratom or plotatomic and not projectatomic:
        kNT = np.power(np.dot(desc_atomic[:], desc[ifps].T), power)
        proj_atomic_all = kpca.transform(kNT)

    # save
    if output == 'matrix':
        np.savetxt(prefix + "-kpca-d" + str(kpca_d) + ".coord", proj, fmt='%4.8f',
                   header='low D coordinates of samples')
    elif output == 'xyz':
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'kpca_coord')
        asapxyz.write(foutput)

    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(
        fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)

    # make plot
    if plotatomic:
        outfile = 'KPCA_4_' + prefix + '-c-' + fcolor + '-plotatomic.png'
    else:
        outfile = 'KPCA_4_' + prefix + '-c-' + fcolor + '.png'

    fig_spec_dict = {
        'outfile': outfile,
        'show': False,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True, 'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {"type": 'scatter', 'clabel': colorlabel},
            "second_p": {"type": 'annotate', 'adtext': adjusttext}
        }
    }
    asap_plot = Plotters(fig_spec_dict)
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], [], tags)
    if peratom or plotatomic and not projectatomic:
        asap_plot.plot(proj_atomic_all[::-1, [pc1, pc2]], plotcolor_peratom[::-1], [], [])
    plt.show()