def add_bad_data(self, cnstfile, saefile, nnfdir, gpuid, sinet, P, T=0.8, V=0.2, M=0.06):
    atest = anitester(cnstfile, saefile, nnfdir, gpuid, sinet)

    # Declare data cache
    cachet = cg('_train', self.saef, self.storecac, True)
    cachev = cg('_valid', self.saef, self.storecac, True)

    Nidx = 0
    Nbad = 0
    Nadd = 0
    for i, (X, E, S) in enumerate(zip(self.xyz, self.Eqm, self.spc)):
        if self.idx[i].size != 0:
            Nidx = Nidx + self.idx[i].size
            self.idx[i], m, diff = atest.test_for_bad(X, E, S, self.idx[i], M)
            Nbad = Nbad + self.idx[i].shape[0]

            self.idx[i], kat, Nt = self.store_random(cachet, X, E, S, self.idx[i], P, T)
            self.idx[i], kav, Nv = self.store_random(cachev, X, E, S, self.idx[i], P, V)

            self.kid[i] = np.array(np.concatenate([self.kid[i], kat]), dtype=int)

            self.ts = self.ts + Nt
            self.vs = self.vs + Nv
            self.nc[i] = self.nc[i] + Nt + Nv
            Nadd = Nadd + Nt + Nv

            # Add data to the cache
            #if idxt.shape[0] != 0:
            #    cachet.insertdata(X[idxt], E[idxt], list(S))
            #if idxv.shape[0] != 0:
            #    cachev.insertdata(X[idxv], E[idxv], list(S))

    print('\n--------Data health information---------')
    print(' -Full: ', self.tf, 'Percent of full used:', "{:.2f}".format(100.0 * (self.ts + self.vs) / float(self.tf)) + '%')
    print(' -Used: ', self.ts, ':', self.vs, ':', self.ts + self.vs)
    print(' -Added:', Nadd, ' bad:', Nbad, 'of', Nidx)
    print('-----------------------------------------\n')

    self.Nbad = Nbad

    # Make meta data file for caches
    cachet.makemetadata()
    cachev.makemetadata()
def init_dataset(self, P, T=0.9, V=0.1):
    # Declare data cache
    cachet = cg('_train', self.saef, self.storecac, False)
    cachev = cg('_valid', self.saef, self.storecac, False)

    for i, (X, F, E, S) in enumerate(zip(self.xyz, self.frc, self.Eqm, self.spc)):
        N = E.shape[0]
        Tp = int(float(T) * float(P) * float(N))
        Vp = int(float(V) * float(P) * float(N))

        # Randomize index
        np.random.shuffle(self.idx[i])

        # Get indices via a uniform random mask
        iix = np.random.uniform(0.0, 1.0, self.idx[i].size)
        tr_idx = np.asarray(np.where(iix < T * P))[0]
        vd_idx = np.asarray(np.where(iix >= 1.0 - (V * P)))[0]

        idxt = self.idx[i][tr_idx].copy()
        idxv = self.idx[i][vd_idx].copy()

        self.kid[i] = np.concatenate([self.kid[i], idxt, idxv])

        self.nc[i] = self.nc[i] + idxt.shape[0] + idxv.shape[0]
        self.ts = self.ts + idxt.shape[0]
        self.vs = self.vs + idxv.shape[0]

        # Update index list
        self.idx[i] = self.idx[i][Tp + Vp + 1:]

        # Add data to the cache
        if idxt.shape[0] != 0:
            cachet.insertdata(X[idxt], F[idxt], E[idxt], list(S))
        if idxv.shape[0] != 0:
            cachev.insertdata(X[idxv], F[idxv], E[idxv], list(S))

    print('Full: ', self.tf)
    print('Used: ', self.ts, ':', self.vs, ':', self.ts + self.vs)

    # Make meta data file for caches
    cachet.makemetadata()
    cachev.makemetadata()
def init_dataset(self, P, T=0.8, V=0.2):
    # Declare data cache
    cachet = cg('_train', self.saef, self.storecac, False)
    cachev = cg('_valid', self.saef, self.storecac, False)

    for i, (X, E, S) in enumerate(zip(self.xyz, self.Eqm, self.spc)):
        N = E.shape[0]
        Tp = int(float(T) * float(P) * float(N))
        Vp = int(float(V) * float(P) * float(N))

        # Randomize index
        np.random.shuffle(self.idx[i])

        # Get indices: first Tp for training, next Vp for validation
        idxt = self.idx[i][0:Tp].copy()
        idxv = self.idx[i][Tp:Tp + Vp].copy()

        self.kid[i] = np.concatenate([self.kid[i], idxt])

        self.nc[i] = self.nc[i] + idxt.shape[0] + idxv.shape[0]
        self.ts = self.ts + idxt.shape[0]
        self.vs = self.vs + idxv.shape[0]

        # Update index list with the unused remainder
        self.idx[i] = self.idx[i][Tp + Vp:]

        # Add data to the cache
        if idxt.shape[0] != 0:
            cachet.insertdata(X[idxt], E[idxt], list(S))
        if idxv.shape[0] != 0:
            cachev.insertdata(X[idxv], E[idxv], list(S))

    print('Full: ', self.tf)
    print('Used: ', self.ts, ':', self.vs, ':', self.ts + self.vs)

    # Make meta data file for caches
    cachet.makemetadata()
    cachev.makemetadata()
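# ---------------------------------------------------------------------------
# Usage sketch (not from this repository): the two init_dataset() variants
# above consume a fraction P of each molecule's remaining conformations and
# split it T/V between the '_train' and '_valid' caches. The object name
# 'builder' and its construction are assumptions; only the
# init_dataset(P, T, V) call mirrors the code above.
builder = aniDataBuilder(hdf5file, saef, storecac)  # hypothetical class/ctor
builder.init_dataset(P=0.10, T=0.8, V=0.2)          # 10% of data, 80/20 split
# ---------------------------------------------------------------------------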
def add_bad_data(self, cnstfile, saefile, nnfdir, gpuid, sinet, P, T=0.9, V=0.1, M=0.3):
    atest = anitester(cnstfile, saefile, nnfdir, gpuid, sinet)

    # Declare data cache
    cachet = cg('_train', self.saef, self.storecac, True)
    cachev = cg('_valid', self.saef, self.storecac, True)

    Nbad = 0
    Nadd = 0
    Ngwd = 0
    Ngto = 0
    Nidx = 0
    Nkid = 0
    Ngid = 0
    for i, (X, F, E, S) in enumerate(zip(self.xyz, self.frc, self.Eqm, self.spc)):
        if self.idx[i].size != 0:
            #print('Parent:', self.prt[i])

            # Check if any previously "good" structures went bad
            tmp_idx1, self.gid[i], mt, difft = atest.test_for_bad(X, E, S, self.gid[i], M)

            # Add the newly bad structures back to the candidate pool
            self.idx[i] = np.array(np.concatenate([tmp_idx1, self.idx[i]]), dtype=np.int32)

            # Test the pool for good and bad structures
            self.idx[i], god_idx, m, diff = atest.test_for_bad(X, E, S, self.idx[i], M)

            # Add good to good index
            self.gid[i] = np.array(np.concatenate([self.gid[i], god_idx]), dtype=np.int32)

            # Add to size of good, good went bad, and total bad
            Ngto = Ngto + self.gid[i].size
            Ngwd = Ngwd + tmp_idx1.size
            Nbad = Nbad + self.idx[i].size

            # Store a random subset of the bad for training
            self.idx[i], kat, Nt = self.store_random(cachet, X, F, E, S, self.idx[i], P, T)
            self.idx[i], kav, Nv = self.store_random(cachev, X, F, E, S, self.idx[i], P, V)
            #self.idx[i], kat, Nt = self.store_diverse(cachet, atest, X, F, E, S, self.idx[i], P, T)
            #self.idx[i], kav, Nv = self.store_diverse(cachev, atest, X, F, E, S, self.idx[i], P, V)

            # Add the training data to kid
            self.kid[i] = np.array(np.concatenate([self.kid[i], kat, kav]), dtype=int)

            # Count totals in the pool
            Nidx = Nidx + self.idx[i].size
            Nkid = Nkid + self.kid[i].size
            Ngid = Ngid + self.gid[i].size

            # Increment training and validation size
            self.ts = self.ts + Nt
            self.vs = self.vs + Nv
            self.nc[i] = self.nc[i] + Nt + Nv
            Nadd = Nadd + Nt + Nv

    self.Nbad = Nbad

    output = '\n--------Data health information---------\n' + \
             ' -Full: ' + str(self.tf) + ' Percent of full used: ' + "{:.2f}".format(100.0 * (self.ts + self.vs) / float(self.tf)) + '%\n' + \
             ' -Used: ' + str(self.ts) + ' : ' + str(self.vs) + ' : ' + str(self.ts + self.vs) + ' Ngwd: ' + str(Ngwd) + '\n' + \
             ' -Skip: Ngwd: ' + str(Ngwd) + ' of ' + str(Ngto) + '\n' + \
             ' -Size: ' + str(Nkid) + ' : ' + str(Nidx) + ' : ' + str(Ngid) + ' : ' + str(Nkid + Nidx + Ngid) + '\n' + \
             ' -Added: ' + str(Nadd) + ' bad: ' + str(Nbad) + ' of ' + str(Nidx) + ' (' + "{:.1f}".format(self.get_percent_bad()) + '%)' + '\n' + \
             '-----------------------------------------\n\n'

    print(output)
    self.of.write(output)
    self.of.flush()

    # Make meta data file for caches
    cachet.makemetadata()
    cachev.makemetadata()
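# ---------------------------------------------------------------------------
# Sketch (not repository code) of an active-learning cycle built around
# add_bad_data(): after each retraining pass, conformations the current
# networks predict with error above M are folded back into the caches.
# 'builder' and 'retrain_networks' are hypothetical placeholders; the
# add_bad_data() and get_percent_bad() calls mirror the methods above.
max_cycles = 10
for cycle in range(max_cycles):
    retrain_networks()  # placeholder for the repository's training step

    builder.add_bad_data(cnstfile, saefile, nnfdir, gpuid, sinet,
                         P=0.05, T=0.9, V=0.1, M=0.3)

    # Stop once few conformations remain poorly predicted
    if builder.get_percent_bad() < 1.0:
        break
# ---------------------------------------------------------------------------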
store_dir = wkdir + "cache-data-"

N = 5
for i in range(N):
    if not os.path.exists(store_dir + str(i)):
        os.mkdir(store_dir + str(i))
    if os.path.exists(store_dir + str(i) + '/testset/testset.h5'):
        os.remove(store_dir + str(i) + '/testset/testset.h5')
    if not os.path.exists(store_dir + str(i) + '/testset'):
        os.mkdir(store_dir + str(i) + '/testset')

cachet = [cg('_train', saef, store_dir + str(r) + '/', forcet, chargt, False) for r in range(N)]
cachev = [cg('_valid', saef, store_dir + str(r) + '/', forcet, chargt, False) for r in range(N)]
testh5 = [pyt.datapacker(store_dir + str(r) + '/testset/testset.h5') for r in range(N)]

Nd = np.zeros(N, dtype=np.int32)
Nbf = 0
for f, fn in enumerate(h5files):
    print('Processing file(' + str(f + 1) + ' of ' + str(len(h5files)) + '):',
hdf5file = '/home/jujuman/Research/ANI-DATASET/ani_data_c08e_gdb09aug.h5'
storecac = '/home/jujuman/Research/GDB-11-wB97X-6-31gd/cache09fsrc/'
saef = "/home/jujuman/Research/GDB-11-wB97X-6-31gd/sae_6-31gd.dat"
path = "/home/jujuman/Research/GDB-11-wB97X-6-31gd/cache09fsrc/testset/c09fsrc-testset.h5"

'''
hdf5file = '/home/jujuman/Research/ANI-DATASET/ani_data_c01test.h5'
storecac = '/home/jujuman/Research/GDB-11-wB97X-6-31gd/cache01_2/'
saef = "/home/jujuman/Research/GDB-11-wB97X-6-31gd/sae_6-31gd.dat"
path = "/home/jujuman/Research/GDB-11-wB97X-6-31gd/cache01_2/testset/c01-testset.h5"
'''

# Construct the data loader class
adl = pya.anidataloader(hdf5file)

# Declare data cache
cachet = cg('_train', saef, storecac)
cachev = cg('_valid', saef, storecac)

# Declare test cache
dpack = pyt.datapacker(path)

# Load morse parameters
popt = np.load('mp_ani_params_test.npz')['param']

# Loop over data in set
for data in adl.getnextdata():
    loc = data['parent'] + "/" + data['child']
    print(loc)

    xyz = data['coordinates']
    eng = data['energies']
store_dir = "/home/jujuman/Research/QM-7TEST/tester/" saef = "/home/jujuman/Research/QM-7TEST/tester/sae_6-31gd.dat" data_index = np.array(range(eng.shape[0])) #unc_file = '/home/jujuman/dataset-qm9/uncharacterized.txt' #data_index = remove_bad_data(unc_file, data_index) np.random.shuffle(data_index) print(data_index.shape) listt = data_index[:int(0.8*len(data_index))] listv = data_index[int(0.8*len(data_index)):int(0.9*len(data_index))] listte = data_index[int(0.9*len(data_index)):] cachet = cg('_train', saef, store_dir) cachev = cg('_valid', saef, store_dir) eng = eng / hdn.hatokcal print('max: ', eng.max(), ' min: ', eng.min()) for n,i in enumerate(listt): print(n) x = xyz[i] e = eng[i] z = spc[i] #z = atn[i] z = z[~((z == 0))] Na = z.shape[0]
def build_strided_training_cache(self, Nblocks, Nvalid, Ntest,
                                 build_test=True, build_valid=False,
                                 forces=True, grad=False, Fkey='forces', forces_unit=1.0,
                                 Ekey='energies', energy_unit=1.0,
                                 Eax0sum=False, rmhighe=True):
    if not os.path.isfile(self.netdict['saefile']):
        self.sae_linear_fitting(Ekey=Ekey, energy_unit=energy_unit, Eax0sum=Eax0sum)

    h5d = self.h5dir

    store_dir = self.train_root + "cache-data-"
    N = self.Nn
    Ntrain = Nblocks - Nvalid - Ntest

    if Nblocks % N != 0:
        raise ValueError('Error: number of networks must evenly divide number of blocks.')

    Nstride = Nblocks / N

    for i in range(N):
        if not os.path.exists(store_dir + str(i)):
            os.mkdir(store_dir + str(i))

        if build_test:
            if os.path.exists(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5'):
                os.remove(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5')

            if not os.path.exists(store_dir + str(i) + '/../testset'):
                os.mkdir(store_dir + str(i) + '/../testset')

    cachet = [cg('_train', self.netdict['saefile'], store_dir + str(r) + '/', False) for r in range(N)]
    cachev = [cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/', False) for r in range(N)]

    if build_test:
        testh5 = [pyt.datapacker(store_dir + str(r) + '/../testset/testset' + str(r) + '.h5') for r in range(N)]

    if build_valid:
        valdh5 = [pyt.datapacker(store_dir + str(r) + '/../testset/valdset' + str(r) + '.h5') for r in range(N)]

    if rmhighe:
        dE = []
        for f in self.h5file:
            adl = pyt.anidataloader(h5d + f)
            for data in adl:
                S = data['species']
                E = data['energies']
                X = data['coordinates']

                Esae = hdt.compute_sae(self.netdict['saefile'], S)

                dE.append((E - Esae) / np.sqrt(len(S)))

        dE = np.concatenate(dE)
        cidx = np.where(np.abs(dE) < 15.0)
        std = np.abs(dE[cidx]).std()
        men = np.mean(dE[cidx])

        print(men, std, men + std)

        idx = np.intersect1d(np.where(dE >= -np.abs(15 * std + men))[0],
                             np.where(dE <= np.abs(11 * std + men))[0])
        cnt = idx.size
        print('DATADIST: ', dE.size, cnt, (dE.size - cnt), 100.0 * ((dE.size - cnt) / dE.size))

    E = []
    data_count = np.zeros((N, 3), dtype=np.int32)
    for f in self.h5file:
        print('Reading data file:', h5d + f)
        adl = pyt.anidataloader(h5d + f)
        for data in adl:
            #print(data['path'], data['energies'].size)

            S = data['species']

            if data[Ekey].size > 0 and (set(S).issubset(self.netdict['atomtyp'])):

                X = np.array(data['coordinates'], order='C', dtype=np.float32)

                #print(np.array(data[Ekey].shape), np.sum(np.array(data[Ekey], order='C', dtype=np.float64), axis=1).shape, data[Fkey].shape)
                if Eax0sum:
                    E = energy_unit * np.sum(np.array(data[Ekey], order='C', dtype=np.float64), axis=1)
                else:
                    E = energy_unit * np.array(data[Ekey], order='C', dtype=np.float64)

                if forces and not grad:
                    F = forces_unit * np.array(data[Fkey], order='C', dtype=np.float32)
                elif forces and grad:
                    F = -forces_unit * np.array(data[Fkey], order='C', dtype=np.float32)
                else:
                    F = 0.0 * X

                if rmhighe:
                    Esae = hdt.compute_sae(self.netdict['saefile'], S)

                    ind_dE = (E - Esae) / np.sqrt(len(S))

                    hidx = np.union1d(np.where(ind_dE < -(15.0 * std + men))[0],
                                      np.where(ind_dE > (11.0 * std + men))[0])

                    lidx = np.intersect1d(np.where(ind_dE >= -(15.0 * std + men))[0],
                                          np.where(ind_dE <= (11.0 * std + men))[0])

                    if hidx.size > 0:
                        print('  -(' + f + ':' + data['path'] + ') High energies detected:\n    ',
                              (E[hidx] - Esae) / np.sqrt(len(S)))

                    X = X[lidx]
                    E = E[lidx]
                    F = F[lidx]

                # Build random split index
                ridx = np.random.randint(0, Nblocks, size=E.size)
                Didx = [np.argsort(ridx)[np.where(ridx == i)] for i in range(Nblocks)]

                # Build training cache
                for nid, cache in enumerate(cachet):
                    set_idx = np.concatenate([Didx[(bid + nid * int(Nstride)) % Nblocks] for bid in range(Ntrain)])
                    if set_idx.size != 0:
                        data_count[nid, 0] += set_idx.size
                        cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                # for nid, cache in enumerate(cachev):
                #     set_idx = np.concatenate([Didx[(1 + bid + nid * int(Nstride)) % Nblocks] for bid in range(Ntrain)])
                #     if set_idx.size != 0:
                #         data_count[nid, 0] += set_idx.size
                #         cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                # Build validation cache
                for nid, cache in enumerate(cachev):
                    set_idx = np.concatenate([Didx[(Ntrain + bid + nid * int(Nstride)) % Nblocks] for bid in range(Nvalid)])
                    if set_idx.size != 0:
                        data_count[nid, 1] += set_idx.size
                        cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                        if build_valid:
                            valdh5[nid].store_data(f + data['path'],
                                                   coordinates=X[set_idx],
                                                   forces=F[set_idx],
                                                   energies=E[set_idx],
                                                   species=list(S))

                # Build test set
                if build_test:
                    for nid, th5 in enumerate(testh5):
                        set_idx = np.concatenate([Didx[(Ntrain + Nvalid + bid + nid * int(Nstride)) % Nblocks] for bid in range(Ntest)])
                        if set_idx.size != 0:
                            data_count[nid, 2] += set_idx.size
                            th5.store_data(f + data['path'],
                                           coordinates=X[set_idx],
                                           forces=F[set_idx],
                                           energies=E[set_idx],
                                           species=list(S))

    # Save train and valid meta file and cleanup testh5
    for t, v in zip(cachet, cachev):
        t.makemetadata()
        v.makemetadata()

    if build_test:
        for th in testh5:
            th.cleanup()

    if build_valid:
        for vh in valdh5:
            vh.cleanup()

    print(' Train ', ' Valid ', ' Test ')
    print(data_count)
    print('Training set built.')
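# ---------------------------------------------------------------------------
# Possible invocation (an assumption, not repository code): 'ens' stands in
# for whatever object defines build_strided_training_cache(). The only hard
# constraint, taken from the check above, is that Nblocks must be divisible
# by the ensemble size self.Nn. With Nn = 8 and Nblocks = 16, each network
# gets Ntrain = 14 training blocks, 1 validation block, and 1 test block.
ens.build_strided_training_cache(Nblocks=16, Nvalid=1, Ntest=1,
                                 build_test=True,
                                 forces=True, grad=False,
                                 Ekey='energies', energy_unit=1.0)
# ---------------------------------------------------------------------------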
def build_training_cache(self, forces=True):
    store_dir = self.train_root + "cache-data-"

    N = self.Nn

    for i in range(N):
        if not os.path.exists(store_dir + str(i)):
            os.mkdir(store_dir + str(i))

        if os.path.exists(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5'):
            os.remove(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5')

        if not os.path.exists(store_dir + str(i) + '/../testset'):
            os.mkdir(store_dir + str(i) + '/../testset')

    cachet = [cg('_train', self.netdict['saefile'], store_dir + str(r) + '/', False) for r in range(N)]
    cachev = [cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/', False) for r in range(N)]
    testh5 = [pyt.datapacker(store_dir + str(r) + '/../testset/testset' + str(r) + '.h5') for r in range(N)]

    Nd = np.zeros(N, dtype=np.int32)
    Nbf = 0
    for f, fn in enumerate(self.h5file):
        print('Processing file(' + str(f + 1) + ' of ' + str(len(self.h5file)) + '):', fn)
        adl = pyt.anidataloader(self.h5dir + fn)

        To = adl.size()
        Ndc = 0
        Fmt = []
        Emt = []
        for c, data in enumerate(adl):
            Pn = data['path'] + '_' + str(f).zfill(6) + '_' + str(c).zfill(6)

            # Extract the data
            X = data['coordinates']
            E = data['energies']
            S = data['species']

            # 0.0 forces if the key doesn't exist
            if forces:
                F = data['forces']
            else:
                F = 0.0 * X

            Fmt.append(np.max(np.linalg.norm(F, axis=2), axis=1))
            Emt.append(E)
            Mv = np.max(np.linalg.norm(F, axis=2), axis=1)

            index = np.where(Mv > 10.5)[0]
            indexk = np.where(Mv <= 10.5)[0]

            Nbf += index.size

            # Remove structures with large forces
            X = X[indexk]
            F = F[indexk]
            E = E[indexk]

            Esae = hdt.compute_sae(self.netdict['saefile'], S)

            hidx = np.where(np.abs(E - Esae) > 10.0)
            lidx = np.where(np.abs(E - Esae) <= 10.0)

            if hidx[0].size > 0:
                print('  -(' + str(c).zfill(3) + ') High energies detected:\n    ', E[hidx])

            X = X[lidx]
            E = E[lidx]
            F = F[lidx]

            Ndc += E.size

            if (set(S).issubset(self.netdict['atomtyp'])):

                # Random mask
                R = np.random.uniform(0.0, 1.0, E.shape[0])
                idx = np.array([interval(r, N) for r in R])

                # Build random split lists
                split = []
                for j in range(N):
                    split.append([i for i, s in enumerate(idx) if s == j])
                    nd = len([i for i, s in enumerate(idx) if s == j])
                    Nd[j] = Nd[j] + nd

                # Store data
                for i, t, v, te in zip(range(N), cachet, cachev, testh5):
                    ## Store training data
                    X_t = np.array(np.concatenate([X[s] for j, s in enumerate(split) if j != i]), order='C', dtype=np.float32)
                    F_t = np.array(np.concatenate([F[s] for j, s in enumerate(split) if j != i]), order='C', dtype=np.float32)
                    E_t = np.array(np.concatenate([E[s] for j, s in enumerate(split) if j != i]), order='C', dtype=np.float64)

                    if E_t.shape[0] != 0:
                        t.insertdata(X_t, F_t, E_t, list(S))

                    ## Store validation data
                    if np.array(split[i]).size > 0:
                        X_v = np.array(X[split[i]], order='C', dtype=np.float32)
                        F_v = np.array(F[split[i]], order='C', dtype=np.float32)
                        E_v = np.array(E[split[i]], order='C', dtype=np.float64)

                        if E_v.shape[0] != 0:
                            v.insertdata(X_v, F_v, E_v, list(S))

    # Print some stats
    print('Data count:', Nd)
    print('Data split:', 100.0 * Nd / np.sum(Nd), '%')

    # Save train and valid meta file and cleanup testh5
    for t, v, th in zip(cachet, cachev, testh5):
        t.makemetadata()
        v.makemetadata()
        th.cleanup()
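# ---------------------------------------------------------------------------
# build_training_cache() above calls an interval(r, N) helper that is not
# shown in this file. A minimal sketch of the assumed behavior (mapping a
# uniform random number in [0, 1) to one of N equal-width bins so each
# conformation lands in exactly one fold), not the repository's own code:
def interval(r, N):
    """Map a uniform random number r in [0, 1) to a bin index in [0, N)."""
    return min(int(r * N), N - 1)  # clamp guards against r == 1.0
# ---------------------------------------------------------------------------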
#wkdir + "/h5data/ani-gdb-c08e.h5",
          ]

store_dir = wkdir + "/cache-c08e-"

#adl.split_load(10)

N = 10

train_idx = [[2, 3, 4, 5, 6, 7, 8, 9],
             [0, 1, 4, 5, 6, 7, 8, 9],
             [0, 1, 2, 3, 6, 7, 8, 9],
             [0, 1, 2, 3, 4, 5, 8, 9],
             [0, 1, 2, 3, 4, 5, 6, 7]]

valid_idx = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]

cachet = [cg('_train', saef, store_dir + str(r) + '/', False) for r in range(5)]
cachev = [cg('_valid', saef, store_dir + str(r) + '/', False) for r in range(5)]

for fn in h5files:
    adl = pyt.anidataloader(fn)
    for c, data in enumerate(adl):
        # Print file
        print('Processing file: ', c)

        # Extract the data
        xyz = data['coordinates']
        erg = data['energies']
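# ---------------------------------------------------------------------------
# The hard-coded train_idx/valid_idx tables above assign two of the ten data
# blocks to validation for each of the five caches and the remaining eight to
# training. The same tables can be generated programmatically (a sketch, not
# repository code), which is less error-prone if the block count changes.
Nblocks_ = 10
Nfolds_ = 5
valid_idx_gen = [[2 * r, 2 * r + 1] for r in range(Nfolds_)]
train_idx_gen = [[b for b in range(Nblocks_) if b not in v] for v in valid_idx_gen]
assert valid_idx_gen == [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]
assert train_idx_gen[0] == [2, 3, 4, 5, 6, 7, 8, 9]
# ---------------------------------------------------------------------------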
store_dir = wkdir + "cache-data-"

N = 5
for i in range(N):
    if not os.path.exists(store_dir + str(i)):
        os.mkdir(store_dir + str(i))
    if os.path.exists(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5'):
        os.remove(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5')
    if not os.path.exists(store_dir + str(i) + '/../testset'):
        os.mkdir(store_dir + str(i) + '/../testset')

cachet = [cg('_train', saef, store_dir + str(r) + '/', False) for r in range(N)]
cachev = [cg('_valid', saef, store_dir + str(r) + '/', False) for r in range(N)]
testh5 = [pyt.datapacker(store_dir + str(r) + '/../testset/testset' + str(r) + '.h5') for r in range(N)]

Nd = np.zeros(N, dtype=np.int32)
Nbf = 0
for f, fn in enumerate(h5files):
    print('Processing file(' + str(f + 1) + ' of ' + str(len(h5files)) + '):', fn)
    adl = pyt.anidataloader(fn)

    To = adl.size()
    Ndc = 0
    Fmt = []
    Emt = []
    for c, data in enumerate(adl):
        #if c == 2 or c == 2 or c == 2:
def build_strided_training_cache(self, Nblocks, Nvalid, Ntest,
                                 build_test=True,
                                 forces=True, grad=False, Fkey='forces', forces_unit=1.0,
                                 Ekey='energies', energy_unit=1.0,
                                 Eax0sum=False):
    if not os.path.isfile(self.netdict['saefile']):
        self.sae_linear_fitting(Ekey=Ekey, energy_unit=energy_unit, Eax0sum=Eax0sum)

    h5d = self.h5dir

    store_dir = self.train_root + "cache-data-"
    N = self.Nn
    Ntrain = Nblocks - Nvalid - Ntest

    if Nblocks % N != 0:
        raise ValueError('Error: number of networks must evenly divide number of blocks.')

    Nstride = Nblocks / N

    for i in range(N):
        if not os.path.exists(store_dir + str(i)):
            os.mkdir(store_dir + str(i))

        if build_test:
            if os.path.exists(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5'):
                os.remove(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5')

            if not os.path.exists(store_dir + str(i) + '/../testset'):
                os.mkdir(store_dir + str(i) + '/../testset')

    cachet = [cg('_train', self.netdict['saefile'], store_dir + str(r) + '/', False) for r in range(N)]
    cachev = [cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/', False) for r in range(N)]

    if build_test:
        testh5 = [pyt.datapacker(store_dir + str(r) + '/../testset/testset' + str(r) + '.h5') for r in range(N)]

    E = []
    data_count = np.zeros((N, 3), dtype=np.int32)
    for f in self.h5file:
        adl = pyt.anidataloader(h5d + f)
        for data in adl:
            #print(data['path'], data['energies'].size)

            S = data['species']

            if data[Ekey].size > 0 and (set(S).issubset(self.netdict['atomtyp'])):

                X = np.array(data['coordinates'], order='C', dtype=np.float32)

                if Eax0sum:
                    E = energy_unit * np.sum(np.array(data[Ekey], order='C', dtype=np.float64), axis=1)
                else:
                    E = energy_unit * np.array(data[Ekey], order='C', dtype=np.float64)

                if forces and not grad:
                    F = forces_unit * np.array(data[Fkey], order='C', dtype=np.float32)
                elif forces and grad:
                    F = -forces_unit * np.array(data[Fkey], order='C', dtype=np.float32)
                else:
                    F = 0.0 * X

                # Build random split index
                ridx = np.random.randint(0, Nblocks, size=E.size)
                Didx = [np.argsort(ridx)[np.where(ridx == i)] for i in range(Nblocks)]

                # Build training cache
                for nid, cache in enumerate(cachet):
                    set_idx = np.concatenate([Didx[(bid + nid * int(Nstride)) % Nblocks] for bid in range(Ntrain)])
                    if set_idx.size != 0:
                        data_count[nid, 0] += set_idx.size
                        cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                # Build validation cache
                for nid, cache in enumerate(cachev):
                    set_idx = np.concatenate([Didx[(Ntrain + bid + nid * int(Nstride)) % Nblocks] for bid in range(Nvalid)])
                    if set_idx.size != 0:
                        data_count[nid, 1] += set_idx.size
                        cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                # Build test set
                if build_test:
                    for nid, th5 in enumerate(testh5):
                        set_idx = np.concatenate([Didx[(Ntrain + Nvalid + bid + nid * int(Nstride)) % Nblocks] for bid in range(Ntest)])
                        if set_idx.size != 0:
                            data_count[nid, 2] += set_idx.size
                            th5.store_data(f + data['path'],
                                           coordinates=X[set_idx],
                                           forces=F[set_idx],
                                           energies=E[set_idx],
                                           species=list(S))

    # Save train and valid meta file and cleanup testh5
    for t, v in zip(cachet, cachev):
        t.makemetadata()
        v.makemetadata()

    if build_test:
        for th in testh5:
            th.cleanup()

    print(' Train ', ' Valid ', ' Test ')
    print(data_count)
    print('Training set built.')
adl = pyt.anidataloader(h5file)
adl.split_load(10)

train_idx = [[2, 3, 4, 5, 6, 7, 8, 9],
             [0, 1, 4, 5, 6, 7, 8, 9],
             [0, 1, 2, 3, 6, 7, 8, 9],
             [0, 1, 2, 3, 4, 5, 8, 9],
             [0, 1, 2, 3, 4, 5, 6, 7]]

valid_idx = [[0], [2], [4], [6], [8]]

r = 0
for t, v in zip(train_idx, valid_idx):
    print("Working on index: ", r)

    cachet = cg('_train', saef, store_dir + str(r) + '/')
    cachev = cg('_valid', saef, store_dir + str(r) + '/')

    for i in range(0, adl.size()):
        print("Working on : ", i, ' from set ', r)

        t_data = adl.getdata(i, t)
        v_data = adl.getdata(i, v)

        #cn = 0
        #for x, y in zip(v_data[0], v_data[1]):
        #    print('Element ', cn, ': ', x, '\n', y)

        #print(t_data[0].shape, ' : ', t_data[1].shape, ' : ', t_data[2].shape)
        #print(v_data[0].shape, ' : ', v_data[1].shape, ' : ', v_data[2].shape)

        if t_data[0].shape[0] != t_data[1].shape[0]: