Example #1
 def __init__(self, datafile):
     self.fdata = dict()
     for df in datafile:
         adl = ant.anidataloader(df)
         tdata = dict()
         for data in adl:
             tdata.update({data['path'].split('/')[-1]: data})
         adl.cleanup()
         self.fdata[df.split('tsdata_')[-1].split('.h5')[0]] = tdata
Example #2
    def sae_linear_fitting(self, Ekey='energies', energy_unit=1.0, Eax0sum=False):
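        # Least-squares fit of per-element self atomic energies (SAE): each row of X
        # counts the atoms of each element in a conformer, y holds its total energy,
        # and the fitted coefficients (one per element) are written to the SAE file.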
        from sklearn import linear_model
        print('Performing linear fitting...')

        datadir = self.h5dir
        sae_out = self.netdict['saefile']

        smap = dict()
        for i,Z in enumerate(self.netdict['atomtyp']):
            smap.update({Z:i})

        Na = len(smap)
        files = os.listdir(datadir)

        X = []
        y = []
        for f in files[0:20]:
            print(f)
            adl = pyt.anidataloader(datadir + f)
            for data in adl:
                # print(data['path'])
                S = data['species']

                if data[Ekey].size > 0:
                    if Eax0sum:
                        E = energy_unit*np.sum(np.array(data[Ekey], order='C', dtype=np.float64), axis=1)
                    else:
                        E = energy_unit*np.array(data[Ekey], order='C', dtype=np.float64)

                    S = S[0:data['coordinates'].shape[1]]
                    unique, counts = np.unique(S, return_counts=True)
                    x = np.zeros(Na, dtype=np.float64)
                    for u, c in zip(unique, counts):
                        x[smap[u]] = c

                    for e in E:
                        X.append(np.array(x))
                        y.append(np.array(e))

        X = np.array(X)
        y = np.array(y).reshape(-1, 1)

        lin = linear_model.LinearRegression(fit_intercept=False)
        lin.fit(X, y)

        coef = lin.coef_
        print(coef)

        sae = open(sae_out, 'w')
        for i, c in enumerate(coef[0]):
            sae.write(next(key for key, value in smap.items() if value == i) + ',' + str(i) + '=' + str(c) + '\n')

        sae.close()

        print('Linear fitting complete.')
Example #3
    def compute_test (self, h5file):
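        # Compute the energy RMSE (kcal/mol) between NeuroChem predictions and the
        # reference energies in h5file, batching conformers so each NeuroChem call
        # handles at most ~65000 atoms.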
        mNa = 100

        # Declare loader
        adl = pyt.anidataloader(h5file)

        # Declare containers
        Eact = []
        Ecmp = []
        Nmt = 0

        for data in adl:
            # Extract the data
            xyz = data['coordinates']
            Eqm = data['energies']
            spc = data['species']

            xyz = xyz.reshape(Eqm.shape[0], len(spc), 3)

            if xyz.shape[0] > 0:
                Nm = xyz.shape[0]
                Na = xyz.shape[1]

                if Na < mNa:
                    mNa = Na

                Nat = Na * Nm

                Nit = int(np.ceil(Nat / 65000.0))
                Nmo = int(65000 / Na)
                Nmx = Nm

                for j in range(0, Nit):
                    # Set up indices
                    i1 = j * Nmo
                    i2 = min(j * Nmo + Nmo, Nm)

                    # copy array subset
                    Eact_t = Eqm[i1:i2]

                    # Set the conformers in NeuroChem
                    self.nc.setConformers(confs=xyz[i1:i2], types=list(spc))

                    Ecmp_t = self.nc.energy()

                    Ecmp.append(np.sum(np.power(hdn.hatokcal * Ecmp_t - hdn.hatokcal * Eact_t,2)))
                    Nmt = Nmt + Ecmp_t.size
                    #Eact.append(Eact_t)

                    #print(hdn.hatokcal * np.sum(np.abs(Ecmp_t-Eact_t))/float(Ecmp_t.size))

        Ecmp = np.array(Ecmp, dtype=np.float64)

        return np.sqrt(np.sum(Ecmp) / float(Nmt))
Example #4

def check_for_outsider(okayl, chckl):
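    # Return True only if every element of chckl also appears in okayl,
    # i.e. no "outsider" species are present.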
    for i in chckl:
        if i not in okayl:
            return False
    return True


dst = "/home/jujuman/Research/ANI-DATASET/h5data/gdb9-2500-bad_new.h5"
src = "/home/jujuman/Research/ANI-DATASET/GDB-09-Data/gdb9-2500-bad.h5"

#open an HDF5 for compressed storage.
#Note that if the path exists, it will open whatever is there.
dpack = pyt.datapacker(dst)
aload = pyt.anidataloader(src)

at = [
    'H',
    'C',
    'N',
    'O',
    #'F',
    #'S',
]

for id, data in enumerate(aload.get_roman_data()):

    xyz = np.asarray(data['coordinates'], dtype=np.float32)
    erg = np.asarray(data['energies'], dtype=np.float64)
    spc = [str(a.decode('ascii')) for a in data['species']]
Example #5
# Import pyanitools
import pyanitools as pyt

# path to the store file
store_file = '/home/jujuman/Research/ANI-DATASET/rxn_db_mig.h5'

# Declare the loader, opens the store
loader = pyt.anidataloader(store_file)

# Load the entire store into memory
loader.totalload()

# Loop over store data
for i in range(loader.size()):
    data = loader.getdata(i)
    print(data[0])

# Closes the store file
loader.cleanup()
path = "/home/jujuman/Research/ANI-DATASET/ANI-1_release/data/ani-1_data_c08.h5"

wkdir = '/home/jujuman/Research/CrossValidation/'
cnstfile = wkdir + 'rHCNO-4.6A_16-3.1A_a4-8.params'
saefile = wkdir + 'sae_6-31gd.dat'

#-------------------------------------------
# Build networks
nc = [
    pync.conformers(cnstfile, saefile,
                    wkdir + 'cv_c08e_ntw_' + str(l) + '/networks/', 0)
    for l in range(5)
]

# Build loader
adl = pyt.anidataloader(path)

# Load data
adl.load_node("/gdb11_s01/")

# Loop
for i in range(adl.size()):
    #print(i, ' of ', adl.size())
    data = adl.getdata(i)

    x = data[0]
    e = data[1]
    s = data[2]

    Nm = e.shape[0]
    Na = len(s)
Example #7
    def __init__(self, hdf5files, saef, output, storecac, storetest, Naev):
        self.xyz = []
        self.frc = []
        self.Eqm = []
        self.spc = []
        self.idx = []
        self.gid = []
        self.prt = []

        self.Naev = Naev

        self.kid = []  # list to track data kept

        self.nt = []  # total conformers
        self.nc = []  # total kept

        self.of = open(output, 'w')

        self.tf = 0

        for f in hdf5files:
            # Construct the data loader class
            adl = pyt.anidataloader(f)
            print('Loading file:', f)

            # Declare test cache
            if os.path.exists(storetest):
                os.remove(storetest)

            dpack = pyt.datapacker(storetest)

            for i, data in enumerate(adl):

                xyz = data['coordinates']
                frc = data['forces']
                eng = data['energies']
                spc = data['species']
                nme = data['path']

                # Toss out high forces
                Mv = np.max(np.linalg.norm(frc, axis=2), axis=1)
                index = np.where(Mv > 1.75)[0]
                indexk = np.where(Mv <= 1.75)[0]

                # Keep only the conformers whose max force norm is <= 1.75
                xyz = xyz[indexk]
                frc = frc[indexk]
                eng = eng[indexk]

                idx = np.random.uniform(0.0, 1.0, eng.size)
                tr_idx = np.asarray(np.where(idx < 0.99))[0]
                te_idx = np.asarray(np.where(idx >= 0.99))[0]
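                # ~99% of the remaining conformers go to training, ~1% to the test store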

                #print(tr_idx)
                if tr_idx.size > 0:
                    self.prt.append(nme)

                    self.xyz.append(
                        np.ndarray.astype(xyz[tr_idx], dtype=np.float32))
                    self.frc.append(
                        np.ndarray.astype(frc[tr_idx], dtype=np.float32))
                    self.Eqm.append(
                        np.ndarray.astype(eng[tr_idx], dtype=np.float64))
                    self.spc.append(spc)

                    Nd = eng[tr_idx].size
                    #print(Nd)

                    self.idx.append(np.arange(Nd))
                    self.kid.append(np.array([], dtype=np.int64))
                    self.gid.append(np.array([], dtype=np.int64))

                    self.tf = self.tf + Nd

                    self.nt.append(Nd)
                    self.nc.append(0)

                # Prepare and store the test data set
                if xyz[te_idx].size != 0:
                    #t_xyz = xyz[te_idx].reshape(te_idx.size, xyz[te_idx].shape[1] * xyz[te_idx].shape[2])
                    dpack.store_data(nme + '/mol' + str(i),
                                     coordinates=xyz[te_idx],
                                     forces=frc[te_idx],
                                     energies=np.array(eng[te_idx]),
                                     species=spc)

            # Clean up
            adl.cleanup()

            # Clean up
            dpack.cleanup()

        self.nt = np.array(self.nt)
        self.nc = np.array(self.nc)

        self.ts = 0
        self.vs = 0

        self.Nbad = self.tf

        self.saef = saef
        self.storecac = storecac
]
cachev = [
    cg('_valid', saef, store_dir + str(r) + '/', forcet, chargt, False)
    for r in range(N)
]
testh5 = [
    pyt.datapacker(store_dir + str(r) + '/testset/testset.h5')
    for r in range(N)
]

Nd = np.zeros(N, dtype=np.int32)
Nbf = 0
for f, fn in enumerate(h5files):
    print('Processing file(' + str(f + 1) + ' of ' + str(len(h5files)) + '):',
          fn)
    adl = pyt.anidataloader(fn)

    To = adl.size()
    Ndc = 0
    Fmt = []
    Emt = []
    for c, data in enumerate(adl):
        if True:

            # Get test store name
            Pn = fn.split('/')[-1].rsplit('.', 1)[0] + data['path']

            # Progress indicator
            sys.stdout.write("\r%d%% %s" % (int(100 * c / float(To)), Pn))
            sys.stdout.flush()
    ax.set_ylim([shr1, shr2])

    font = {'family': 'Bitstream Vera Sans', 'weight': 'heavy', 'size': 24}

    ax.set_ylabel('$E_{cmp}$', fontdict=font)
    ax.set_xlabel('$E_{ref}$', fontdict=font)


# Set data fields
#h5file = '/home/jujuman/Research/SingleNetworkTest/cache02/testset/testset.h5'
h5file = '/home/jujuman/Research/DataReductionMethods/models/cache/testset/testset.h5'

#h5file = '/home/jujuman/Research/ANI-DATASET/h5data/ani-gdb-c03.h5'

# Declare loader
adl = pyt.anidataloader(h5file)

nl = adl.get_group_list()
print(nl)

#node = "gdb11_s10"

#Network 1 Files
#wkdir = '/home/jujuman/Scratch/Dropbox/ChemSciencePaper.AER/networks/ANI-SN_CHNOSF-1/'
wkdir = '/home/jujuman/Research/DataReductionMethods/models/train_c08f/'
#wkdir = '/home/jujuman/Dropbox/ChemSciencePaper.AER/networks/ANI-c08f-ntwk/'
#wkdir = '/home/jujuman/Research/GDB-11-wB97X-6-31gd/train_08_9/'
#wkdir = '/home/jujuman/Research/GDB-11-wB97X-6-31gd/train_01/'

cnstfile = wkdir + 'rHCNO-4.6A_16-3.1A_a4-8.params'
saefile = wkdir + 'sae_6-31gd.dat'
Example #10
    def build_strided_training_cache(self,
                                     Nblocks,
                                     Nvalid,
                                     Ntest,
                                     build_test=True,
                                     build_valid=False,
                                     forces=True,
                                     grad=False,
                                     Fkey='forces',
                                     forces_unit=1.0,
                                     Ekey='energies',
                                     energy_unit=1.0,
                                     Eax0sum=False,
                                     rmhighe=True):
        if not os.path.isfile(self.netdict['saefile']):
            self.sae_linear_fitting(Ekey=Ekey,
                                    energy_unit=energy_unit,
                                    Eax0sum=Eax0sum)
        h5d = self.h5dir
        store_dir = self.train_root + "cache-data-"
        N = self.Nn
        Ntrain = Nblocks - Nvalid - Ntest
        if Nblocks % N != 0:
            raise ValueError(
                'Error: number of networks must evenly divide number of blocks.'
            )
        Nstride = Nblocks / N
        for i in range(N):
            if not os.path.exists(store_dir + str(i)):
                os.mkdir(store_dir + str(i))
            if build_test:
                if os.path.exists(store_dir + str(i) + '/../testset/testset' +
                                  str(i) + '.h5'):
                    os.remove(store_dir + str(i) + '/../testset/testset' +
                              str(i) + '.h5')
                if not os.path.exists(store_dir + str(i) + '/../testset'):
                    os.mkdir(store_dir + str(i) + '/../testset')
        cachet = [
            cg('_train', self.netdict['saefile'], store_dir + str(r) + '/',
               False) for r in range(N)
        ]
        cachev = [
            cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/',
               False) for r in range(N)
        ]

        if build_test:
            testh5 = [
                pyt.datapacker(store_dir + str(r) + '/../testset/testset' +
                               str(r) + '.h5') for r in range(N)
            ]

        if build_valid:
            valdh5 = [
                pyt.datapacker(store_dir + str(r) + '/../testset/valdset' +
                               str(r) + '.h5') for r in range(N)
            ]

        if rmhighe:
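            # First pass over all files: collect per-conformer energy deviations from the
            # SAE baseline (scaled by 1/sqrt(Natoms)) to estimate the mean/std used for
            # outlier removal in the main loop below.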
            dE = []
            for f in self.h5file:
                adl = pyt.anidataloader(h5d + f)
                for data in adl:
                    S = data['species']
                    E = data['energies']
                    X = data['coordinates']

                    Esae = hdt.compute_sae(self.netdict['saefile'], S)

                    dE.append((E - Esae) / np.sqrt(len(S)))

            dE = np.concatenate(dE)
            cidx = np.where(np.abs(dE) < 15.0)
            std = np.abs(dE[cidx]).std()
            men = np.mean(dE[cidx])

            print(men, std, men + std)
            idx = np.intersect1d(
                np.where(dE >= -np.abs(15 * std + men))[0],
                np.where(dE <= np.abs(11 * std + men))[0])
            cnt = idx.size
            print('DATADIST: ', dE.size, cnt, (dE.size - cnt),
                  100.0 * ((dE.size - cnt) / dE.size))

        E = []
        data_count = np.zeros((N, 3), dtype=np.int32)
        for f in self.h5file:
            print('Reading data file:', h5d + f)
            adl = pyt.anidataloader(h5d + f)
            for data in adl:
                #print(data['path'],data['energies'].size)

                S = data['species']

                if data[Ekey].size > 0 and (set(S).issubset(
                        self.netdict['atomtyp'])):

                    X = np.array(data['coordinates'],
                                 order='C',
                                 dtype=np.float32)

                    #print(np.array(data[Ekey].shape),np.sum(np.array(data[Ekey], order='C', dtype=np.float64),axis=1).shape,data[Fkey].shape)

                    if Eax0sum:
                        E = energy_unit * np.sum(np.array(
                            data[Ekey], order='C', dtype=np.float64),
                                                 axis=1)
                    else:
                        E = energy_unit * np.array(
                            data[Ekey], order='C', dtype=np.float64)

                    if forces and not grad:
                        F = forces_unit * np.array(
                            data[Fkey], order='C', dtype=np.float32)
                    elif forces and grad:
                        F = -forces_unit * np.array(
                            data[Fkey], order='C', dtype=np.float32)
                    else:
                        F = 0.0 * X

                    if rmhighe:
                        Esae = hdt.compute_sae(self.netdict['saefile'], S)

                        ind_dE = (E - Esae) / np.sqrt(len(S))

                        hidx = np.union1d(
                            np.where(ind_dE < -(15.0 * std + men))[0],
                            np.where(ind_dE > (11.0 * std + men))[0])
                        lidx = np.intersect1d(
                            np.where(ind_dE >= -(15.0 * std + men))[0],
                            np.where(ind_dE <= (11.0 * std + men))[0])

                        if hidx.size > 0:
                            print(
                                '  -(' + f + ':' + data['path'] +
                                ')High energies detected:\n    ',
                                (E[hidx] - Esae) / np.sqrt(len(S)))

                        X = X[lidx]
                        E = E[lidx]
                        F = F[lidx]

                    # Build random split index
                    ridx = np.random.randint(0, Nblocks, size=E.size)
                    Didx = [
                        np.argsort(ridx)[np.where(ridx == i)]
                        for i in range(Nblocks)
                    ]
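                    # Each network gets a distinct stride over the Nblocks random blocks:
                    # Ntrain blocks feed its training cache, the next Nvalid its validation
                    # cache, and the final Ntest its test store.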

                    # Build training cache
                    for nid, cache in enumerate(cachet):
                        set_idx = np.concatenate([
                            Didx[((bid + nid * int(Nstride)) % Nblocks)]
                            for bid in range(Ntrain)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 0] += set_idx.size
                            cache.insertdata(X[set_idx], F[set_idx],
                                             E[set_idx], list(S))

                    # for nid,cache in enumerate(cachev):
                    #     set_idx = np.concatenate([Didx[((1+bid+nid*int(Nstride)) % Nblocks)] for bid in range(Ntrain)])
                    #     if set_idx.size != 0:
                    #         data_count[nid,0]+=set_idx.size
                    #         cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                    for nid, cache in enumerate(cachev):
                        set_idx = np.concatenate([
                            Didx[(Ntrain + bid + nid * int(Nstride)) % Nblocks]
                            for bid in range(Nvalid)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 1] += set_idx.size
                            cache.insertdata(X[set_idx], F[set_idx],
                                             E[set_idx], list(S))
                            if build_valid:
                                valdh5[nid].store_data(f + data['path'],
                                                       coordinates=X[set_idx],
                                                       forces=F[set_idx],
                                                       energies=E[set_idx],
                                                       species=list(S))

                    if build_test:
                        for nid, th5 in enumerate(testh5):
                            set_idx = np.concatenate([
                                Didx[(Ntrain + Nvalid + bid +
                                      nid * int(Nstride)) % Nblocks]
                                for bid in range(Ntest)
                            ])
                            if set_idx.size != 0:
                                data_count[nid, 2] += set_idx.size
                                th5.store_data(f + data['path'],
                                               coordinates=X[set_idx],
                                               forces=F[set_idx],
                                               energies=E[set_idx],
                                               species=list(S))

        # Save train and valid meta file and cleanup testh5
        for t, v in zip(cachet, cachev):
            t.makemetadata()
            v.makemetadata()

        if build_test:
            for th in testh5:
                th.cleanup()

        if build_valid:
            for vh in valdh5:
                vh.cleanup()

        print(' Train ', ' Valid ', ' Test ')
        print(data_count)
        print('Training set built.')
Example #11
    def __init__(self, hdf5files, saef, storecac, storetest):
        self.xyz = []
        self.Eqm = []
        self.spc = []
        self.idx = []
        self.prt = []

        self.kid = [] # list to track data kept

        self.nt = [] # total conformers
        self.nc = [] # total kept

        self.tf = 0

        for f in hdf5files:
            # Construct the data loader class
            adl = pyt.anidataloader(f)

            # Declare test cache
            if os.path.exists(storetest):
                os.remove(storetest)

            dpack = pyt.datapacker(storetest)

            for i, data in enumerate(adl):
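                # Split each molecule's conformers into 10 chunks: the first 9 go to the
                # training arrays, the 10th is written to the test-set HDF5 below.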
                xyz = np.array_split(data['coordinates'], 10)
                eng = np.array_split(data['energies'], 10)
                spc = data['species']
                nme = data['parent']

                self.prt.append(nme)

                self.xyz.append( np.concatenate(xyz[0:9]) )
                self.Eqm.append( np.concatenate(eng[0:9]) )
                self.spc.append(spc)

                Nd = np.concatenate(eng[0:9]).shape[0]

                self.idx.append( np.arange(Nd) )
                self.kid.append( np.array([], dtype=np.int64) )

                self.tf = self.tf + Nd

                self.nt.append(Nd)
                self.nc.append(0)

                # Prepare and store the test data set
                if xyz[9].size != 0:
                    t_xyz = xyz[9].reshape(xyz[9].shape[0], xyz[9].shape[1] * xyz[9].shape[2])
                    dpack.store_data(nme + '/mol' + str(i), coordinates=t_xyz, energies=np.array(eng[9]), species=spc)

            # Clean up
            adl.cleanup()

            # Clean up
            dpack.cleanup()

        self.nt = np.array(self.nt)
        self.nc = np.array(self.nc)

        self.ts = 0
        self.vs = 0

        self.Nbad = self.tf

        self.saef = saef
        self.storecac = storecac
Example #12
h5_list = [
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0002.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0003.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0004.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0005.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0006.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0007.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0008.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0009.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0010.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0011.h5',
]

r = re.compile(r'(.+?)(\d+?)')
comb = dict()
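# Group entries from all files by the molecule key parsed out of each HDF5 path,
# so data for the same molecule can be combined.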
for h5 in h5_list:
    adl = pyt.anidataloader(h5)
    for data in adl:
        key = data['path'].split('_')[1].split('/')[0]
        items = r.findall(key)
        #print(key, items, sorted(items))
        #print(data.keys())
        if key in comb:
            comb[key].append(data)
        else:
            comb[key] = [data]

print(len(list(comb.keys())))
for k in comb.keys():
    data = comb[k]
    data_new = dict()
    data_new['energies'] = data
Example #13
import numpy as np
import hdnntools as gt
import pyanitools as pyt
import os

lfile = '/home/jujuman/DataTesting/gdb9-2500-div-dim.h5'
sfile = '/home/jujuman/DataTesting/gdb9-2500-div-dim_35.h5'

if os.path.exists(sfile):
    os.remove(sfile)

adl = pyt.anidataloader(lfile)
dpk = pyt.datapacker(sfile)

for i,x in enumerate(adl):
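    # Repack each group into the new store with coordinates flattened to
    # shape (Nconformers, 3*Natoms).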
    print(i)
    xyz = np.asarray(x['coordinates'],dtype=np.float32)
    erg = x['energies']
    spc = x['species']

    dpk.store_data('/gdb-09-DIV/mol'+str(i), coordinates=xyz.reshape(erg.shape[0],len(spc)*3), energies=erg, species=spc)

adl.cleanup()
dpk.cleanup()
Example #14

def MAE(act, pre):
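    # Mean absolute error between predicted (pre) and reference (act) values.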
    N = act.shape[0]
    e = (np.abs(pre - act)).sum()
    return e / float(N)


# Set required files for pyNeuroChem
anipath = '/home/jujuman/Research/QM-7TEST/tester/ANI-QM7-ntwk'
cnstfile = anipath + '/rHCNOS-5.0A_16-3.1A_a4-8.params'
saefile = anipath + '/../sae_6-31gd.dat'
nnfdir = anipath + '/networks/'

path = "/home/jujuman/Scratch/Research/QM-7TEST/QM7-test-ho.h5"
datas = pyt.anidataloader(path)
datas.totalload()

# Construct pyNeuroChem class
nc = pync.conformers(cnstfile, saefile, nnfdir, 0)

Ea = np.zeros(datas.size())
Ec = np.zeros(datas.size())

for i in range(datas.size()):

    print(i, ' of ', datas.size())
    data = datas.getdata(i)

    x = data[0]
    e = data[1]
Example #15
model = Net(dims).to('cuda')
model.load_state_dict(torch.load(args.model))
optimizer = optim.SGD(model.parameters(), lr=args.lr)    
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)


# load the data into memory (big)


mcnt = 0 #molecules
ccnt = 0 #conformers
elements = set()
examples = [] #the entire training set loaded into memory
examplesbysize = dict()
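# examplesbysize buckets examples by molecule size (number of atoms).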
for hd5file in sorted(glob.glob('*.h5')):
    for data in pya.anidataloader(hd5file):
        #calculate some statistics
        mcnt += 1
        ccnt += len(data['energies'])
        elements.update(data['species'])
        
        #molecule types and radii
        types = np.array([typemap[elem] for elem in data['species']], dtype=np.float32)
        radii = np.array([typeradii[int(index)] for index in types], dtype=np.float32)

        sz = len(radii)
        if sz not in examplesbysize:
            examplesbysize[sz] = []
        #create an example for every conformer
        for coord, energy in zip(data['coordinates'],data['energies']):
            c = molgrid.CoordinateSet(coord.astype(np.float32), types, radii,4)
Example #16
 def build_training_cache(self, forces=True):
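     # Build per-network train/validation caches (plus a shared test HDF5 per fold)
     # from this class's HDF5 files, dropping high-force and high-energy conformers.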
     store_dir = self.train_root + "cache-data-"
     N = self.Nn
     for i in range(N):
         if not os.path.exists(store_dir + str(i)):
             os.mkdir(store_dir + str(i))
         if os.path.exists(store_dir + str(i) + '/../testset/testset' +
                           str(i) + '.h5'):
             os.remove(store_dir + str(i) + '/../testset/testset' + str(i) +
                       '.h5')
         if not os.path.exists(store_dir + str(i) + '/../testset'):
             os.mkdir(store_dir + str(i) + '/../testset')
     cachet = [
         cg('_train', self.netdict['saefile'], store_dir + str(r) + '/',
            False) for r in range(N)
     ]
     cachev = [
         cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/',
            False) for r in range(N)
     ]
     testh5 = [
         pyt.datapacker(store_dir + str(r) + '/../testset/testset' +
                        str(r) + '.h5') for r in range(N)
     ]
     Nd = np.zeros(N, dtype=np.int32)
     Nbf = 0
     for f, fn in enumerate(self.h5file):
         print(
             'Processing file(' + str(f + 1) + ' of ' +
             str(len(self.h5file)) + '):', fn)
         adl = pyt.anidataloader(self.h5dir + fn)
         To = adl.size()
         Ndc = 0
         Fmt = []
         Emt = []
         for c, data in enumerate(adl):
             Pn = data['path'] + '_' + str(f).zfill(6) + '_' + str(c).zfill(
                 6)
             # Extract the data
             X = data['coordinates']
             E = data['energies']
             S = data['species']
             # Use zero forces when force training is disabled
             if forces:
                 F = data['forces']
             else:
                 F = 0.0 * X
             Fmt.append(np.max(np.linalg.norm(F, axis=2), axis=1))
             Emt.append(E)
             Mv = np.max(np.linalg.norm(F, axis=2), axis=1)
             index = np.where(Mv > 10.5)[0]
             indexk = np.where(Mv <= 10.5)[0]
             Nbf += index.size
             # Clear forces
             X = X[indexk]
             F = F[indexk]
             E = E[indexk]
             Esae = hdt.compute_sae(self.netdict['saefile'], S)
             hidx = np.where(np.abs(E - Esae) > 10.0)
             lidx = np.where(np.abs(E - Esae) <= 10.0)
             if hidx[0].size > 0:
                 print(
                     '  -(' + str(c).zfill(3) +
                     ')High energies detected:\n    ', E[hidx])
             X = X[lidx]
             E = E[lidx]
             F = F[lidx]
             Ndc += E.size
             if (set(S).issubset(self.netdict['atomtyp'])):
                 # Random mask
                 R = np.random.uniform(0.0, 1.0, E.shape[0])
                 idx = np.array([interval(r, N) for r in R])
                 # Build random split lists
                 split = []
                 for j in range(N):
                     split.append([i for i, s in enumerate(idx) if s == j])
                     nd = len([i for i, s in enumerate(idx) if s == j])
                     Nd[j] = Nd[j] + nd
                 # Store data
                 for i, t, v, te in zip(range(N), cachet, cachev, testh5):
                     ## Store training data
                     X_t = np.array(np.concatenate(
                         [X[s] for j, s in enumerate(split) if j != i]),
                                    order='C',
                                    dtype=np.float32)
                     F_t = np.array(np.concatenate(
                         [F[s] for j, s in enumerate(split) if j != i]),
                                    order='C',
                                    dtype=np.float32)
                     E_t = np.array(np.concatenate(
                         [E[s] for j, s in enumerate(split) if j != i]),
                                    order='C',
                                    dtype=np.float64)
                     if E_t.shape[0] != 0:
                         t.insertdata(X_t, F_t, E_t, list(S))
                     ## Store Validation
                     if np.array(split[i]).size > 0:
                         X_v = np.array(X[split[i]],
                                        order='C',
                                        dtype=np.float32)
                         F_v = np.array(F[split[i]],
                                        order='C',
                                        dtype=np.float32)
                         E_v = np.array(E[split[i]],
                                        order='C',
                                        dtype=np.float64)
                         if E_v.shape[0] != 0:
                             v.insertdata(X_v, F_v, E_v, list(S))
     # Print some stats
     print('Data count:', Nd)
     print('Data split:', 100.0 * Nd / np.sum(Nd), '%')
     # Save train and valid meta file and cleanup testh5
     for t, v, th in zip(cachet, cachev, testh5):
         t.makemetadata()
         v.makemetadata()
         th.cleanup()
Example #17
from glob import glob

import numpy as np
import pyanitools as pya
import json

fns = glob('../ANI-1_release/ani*.h5')
frames_different_molecule = []
frames_different_molecule_test = []
frames100_conf_per_mol = []
total_molecules = 22057374
seed = 2020
    
Nstruct = 0

for it, fn in enumerate(fns):
    print(fn)

    adl = pya.anidataloader(fn)

    # Pick one random training frame and one random test frame per molecule block
    for in_data, data in enumerate(adl):

        # Extract the data
        E = data['energies']

        mol_in_the_block = E.shape[0]
        shifts = np.random.RandomState(seed=seed).permutation(np.arange(mol_in_the_block))
        frames_different_molecule.append(Nstruct + shifts[0])
        frames_different_molecule_test.append(Nstruct + shifts[1])
        Nstruct += mol_in_the_block
    adl.cleanup()
    print('Number of molecules in the dataset is ' + str(Nstruct))
frames = {'frames':frames_different_molecule,
#hdf5file = '/home/jujuman/Research/ANI-DATASET/ani-1_data_c03.h5'
storecac = '/home/jujuman/Research/SingleNetworkTest/cache06/'
saef = "/home/jujuman/Research/SingleNetworkTest/sae_6-31gd.dat"
path = "/home/jujuman/Research/SingleNetworkTest/cache06/testset/testset.h5"

# Declare data cache
cachet = cg('_train', saef, storecac, False)
cachev = cg('_valid', saef, storecac, False)

# Declare test cache
dpack = pyt.datapacker(path)

for f in hdf5files:
    # Construct the data loader class
    print(f)
    adl = pyt.anidataloader(f[0])

    print(adl.get_group_list())

    # Loop over data in set
    dc = 0
    for i, data in enumerate(adl):
        #if (i == 2):
        xyz = np.array_split(data['coordinates'], 10)
        eng = np.array_split(data['energies'], 10)
        spc = data['species']
        nme = data['parent']

        #print('Parent: ', nme, eng)
        dc = dc + np.concatenate(eng[0:8]).shape[0]
Example #19
    def build_strided_training_cache(self,
                                     Nblocks,
                                     Nvalid,
                                     Ntest,
                                     build_test=True,
                                     forces=True,
                                     grad=False,
                                     Fkey='forces',
                                     forces_unit=1.0,
                                     Ekey='energies',
                                     energy_unit=1.0,
                                     Eax0sum=False):
        if not os.path.isfile(self.netdict['saefile']):
            self.sae_linear_fitting(Ekey=Ekey,
                                    energy_unit=energy_unit,
                                    Eax0sum=Eax0sum)

        h5d = self.h5dir

        store_dir = self.train_root + "cache-data-"
        N = self.Nn
        Ntrain = Nblocks - Nvalid - Ntest

        if Nblocks % N != 0:
            raise ValueError(
                'Error: number of networks must evenly divide number of blocks.'
            )

        Nstride = Nblocks / N

        for i in range(N):
            if not os.path.exists(store_dir + str(i)):
                os.mkdir(store_dir + str(i))

            if build_test:
                if os.path.exists(store_dir + str(i) + '/../testset/testset' +
                                  str(i) + '.h5'):
                    os.remove(store_dir + str(i) + '/../testset/testset' +
                              str(i) + '.h5')

                if not os.path.exists(store_dir + str(i) + '/../testset'):
                    os.mkdir(store_dir + str(i) + '/../testset')

        cachet = [
            cg('_train', self.netdict['saefile'], store_dir + str(r) + '/',
               False) for r in range(N)
        ]
        cachev = [
            cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/',
               False) for r in range(N)
        ]

        if build_test:
            testh5 = [
                pyt.datapacker(store_dir + str(r) + '/../testset/testset' +
                               str(r) + '.h5') for r in range(N)
            ]

        E = []
        data_count = np.zeros((N, 3), dtype=np.int32)
        for f in self.h5file:
            adl = pyt.anidataloader(h5d + f)
            for data in adl:
                #print(data['path'],data['energies'].size)

                S = data['species']

                if data[Ekey].size > 0 and (set(S).issubset(
                        self.netdict['atomtyp'])):

                    X = np.array(data['coordinates'],
                                 order='C',
                                 dtype=np.float32)

                    if Eax0sum:
                        E = energy_unit * np.sum(np.array(
                            data[Ekey], order='C', dtype=np.float64),
                                                 axis=1)
                    else:
                        E = energy_unit * np.array(
                            data[Ekey], order='C', dtype=np.float64)

                    if forces and not grad:
                        F = forces_unit * np.array(
                            data[Fkey], order='C', dtype=np.float32)
                    elif forces and grad:  # elif, so the else below only fires when forces is False
                        F = -forces_unit * np.array(
                            data[Fkey], order='C', dtype=np.float32)
                    else:
                        F = 0.0 * X

                    # Build random split index
                    ridx = np.random.randint(0, Nblocks, size=E.size)
                    Didx = [
                        np.argsort(ridx)[np.where(ridx == i)]
                        for i in range(Nblocks)
                    ]

                    # Build training cache
                    for nid, cache in enumerate(cachet):
                        set_idx = np.concatenate([
                            Didx[((bid + nid * int(Nstride)) % Nblocks)]
                            for bid in range(Ntrain)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 0] += set_idx.size
                            cache.insertdata(X[set_idx], F[set_idx],
                                             E[set_idx], list(S))

                    for nid, cache in enumerate(cachev):
                        set_idx = np.concatenate([
                            Didx[(Ntrain + bid + nid * int(Nstride)) % Nblocks]
                            for bid in range(Nvalid)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 1] += set_idx.size
                            cache.insertdata(X[set_idx], F[set_idx],
                                             E[set_idx], list(S))

                    if build_test:
                        for nid, th5 in enumerate(testh5):
                            set_idx = np.concatenate([
                                Didx[(Ntrain + Nvalid + bid +
                                      nid * int(Nstride)) % Nblocks]
                                for bid in range(Ntest)
                            ])
                            if set_idx.size != 0:
                                data_count[nid, 2] += set_idx.size
                                th5.store_data(f + data['path'],
                                               coordinates=X[set_idx],
                                               forces=F[set_idx],
                                               energies=E[set_idx],
                                               species=list(S))

        # Save train and valid meta file and cleanup testh5
        for t, v in zip(cachet, cachev):
            t.makemetadata()
            v.makemetadata()

        if build_test:
            for th in testh5:
                th.cleanup()

        print(' Train ', ' Valid ', ' Test ')
        print(data_count)
        print('Training set built.')
Example #20
import hdnntools as hdt
import pyanitools as pyt
import os

file = '/home/jujuman/Research/DataReductionMethods/model6/model0.05me/ani_red_c06.h5'
sdir = '/home/jujuman/Research/GDB-11-AL-wB97x631gd/'

aload = pyt.anidataloader(file)

for data in aload:
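    # Write each group's conformers to an .xyz file named from its parent group and
    # molecule index under sdir.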

    X = data['coordinates']
    S = data['species']
    P = data['path']

    parent = P.split('/')[1]
    index  = P.split('/')[2].split('mol')[1].zfill(7)

    path = sdir+parent
    if not os.path.exists(path):
        os.mkdir(path)

    print(path + '/' + parent + '-' + index + '.xyz','DATA:',X.shape[0])
    hdt.writexyzfile(path+'/'+parent+'-'+index+'.xyz',X,S)
import os
import pickle
import pyanitools
from neurochem_calculator import NeuroChem, path
import tqdm

neurochem = NeuroChem()

# generate expected outputs for the ANI-1 subset
mol_count = 0
for i in [1, 2, 3, 4]:
    data_file = os.path.join(
        path, '../../dataset/ani1-up_to_gdb4/ani_gdb_s0{}.h5'.format(i))
    adl = pyanitools.anidataloader(data_file)
    for data in tqdm.tqdm(adl, desc='ANI1: {} heavy atoms'.format(i)):
        coordinates = data['coordinates'][:10, :]
        pickleobj = neurochem(coordinates, data['species'])
        dumpfile = os.path.join(
            path, '../../tests/test_data/ANI1_subset/{}'.format(mol_count))
        with open(dumpfile, 'wb') as f:
            pickle.dump(pickleobj, f)
        mol_count += 1
Example #22
  def shard_generator():

    shard_size = 4096 * 64

    row_idx = 0
    group_idx = 0

    X_cache = []
    y_cache = []
    w_cache = []
    ids_cache = []

    for hdf5file in hdf5files:
      adl = pya.anidataloader(hdf5file)
      for data in adl:

        # Extract the data
        P = data['path']
        R = data['coordinates']
        E = data['energies']
        S = data['species']
        smi = data['smiles']

        if len(S) > 23:
          print("skipping:", smi, "due to atom count.")
          continue

        # Print the data
        print("Processing: ", P)
        print("  Smiles:      ", "".join(smi))
        print("  Symbols:     ", S)
        print("  Coordinates: ", R.shape)
        print("  Energies:    ", E.shape)

        Z_padded = np.zeros((23,), dtype=np.float32)
        nonpadded = convert_species_to_atomic_nums(S)
        Z_padded[:nonpadded.shape[0]] = nonpadded

        if mode == "relative":
          offset = np.amin(E)
        elif mode == "atomization":

          # self-interaction energies taken from
          # https://github.com/isayev/ANI1_dataset README
          atomizationEnergies = {
              0: 0,
              1: -0.500607632585,
              6: -37.8302333826,
              7: -54.5680045287,
              8: -75.0362229210
          }

          offset = 0

          for z in nonpadded:
            offset -= atomizationEnergies[z]
        elif mode == "absolute":
          offset = 0
        else:
          raise Exception("Unsupported mode: ", mode)

        for k in range(len(E)):
          R_padded = np.zeros((23, 3), dtype=np.float32)
          R_padded[:R[k].shape[0], :R[k].shape[1]] = R[k]

          X = np.concatenate([np.expand_dims(Z_padded, 1), R_padded], axis=1)

          y = E[k] - offset

          # flush a full shard before appending, so the current example is not dropped
          if len(X_cache) == shard_size:

            yield np.array(X_cache), np.array(y_cache), np.array(
                w_cache), np.array(ids_cache)

            X_cache = []
            y_cache = []
            w_cache = []
            ids_cache = []

          X_cache.append(X)
          y_cache.append(np.array(y).reshape((1,)))
          w_cache.append(np.array(1).reshape((1,)))
          ids_cache.append(row_idx)
          row_idx += 1
          groups.append(group_idx)

        group_idx += 1

    # flush once more at the end
    if len(X_cache) > 0:
      yield np.array(X_cache), np.array(y_cache), np.array(w_cache), np.array(
          ids_cache)
Example #23
import pyanitools as pyt

adl = pyt.anidataloader(
    '/home/jujuman/Research/ANI-DATASET/h5data/r10_ccsd.h5')

for i, data in enumerate(adl):
    print(data['energies'])
Example #24
    def shard_generator():

        shard_size = 4096 * 64

        row_idx = 0
        group_idx = 0

        X_cache = []
        y_cache = []
        w_cache = []
        ids_cache = []

        for hdf5file in hdf5files:
            adl = pya.anidataloader(hdf5file)
            for data in adl:

                # Extract the data
                P = data['path']
                R = data['coordinates']
                E = data['energies']
                S = data['species']
                smi = data['smiles']

                if len(S) > 23:
                    print("skipping:", smi, "due to atom count.")
                    continue

                # Print the data
                print("Processing: ", P)
                print("  Smiles:      ", "".join(smi))
                print("  Symbols:     ", S)
                print("  Coordinates: ", R.shape)
                print("  Energies:    ", E.shape)

                Z_padded = np.zeros((23, ), dtype=np.float32)
                nonpadded = convert_species_to_atomic_nums(S)
                Z_padded[:nonpadded.shape[0]] = nonpadded

                if mode == "relative":
                    offset = np.amin(E)
                elif mode == "atomization":

                    # self-interaction energies taken from
                    # https://github.com/isayev/ANI1_dataset README
                    atomizationEnergies = {
                        0: 0,
                        1: -0.500607632585,
                        6: -37.8302333826,
                        7: -54.5680045287,
                        8: -75.0362229210
                    }

                    offset = 0

                    for z in nonpadded:
                        offset -= atomizationEnergies[z]
                elif mode == "absolute":
                    offset = 0
                else:
                    raise Exception("Unsupported mode: ", mode)

                for k in range(len(E)):
                    R_padded = np.zeros((23, 3), dtype=np.float32)
                    R_padded[:R[k].shape[0], :R[k].shape[1]] = R[k]

                    X = np.concatenate([np.expand_dims(Z_padded, 1), R_padded],
                                       axis=1)

                    y = E[k] - offset

                    # flush a full shard before appending, so the current example is not dropped
                    if len(X_cache) == shard_size:

                        yield np.array(X_cache), np.array(y_cache), np.array(
                            w_cache), np.array(ids_cache)

                        X_cache = []
                        y_cache = []
                        w_cache = []
                        ids_cache = []

                    X_cache.append(X)
                    y_cache.append(np.array(y).reshape((1, )))
                    w_cache.append(np.array(1).reshape((1, )))
                    ids_cache.append(row_idx)
                    row_idx += 1
                    groups.append(group_idx)

                group_idx += 1

        # flush once more at the end
        if len(X_cache) > 0:
            yield np.array(X_cache), np.array(y_cache), np.array(
                w_cache), np.array(ids_cache)
Example #25
    def generate_stats(self, maxe=sys.float_info.max, forces=True, grad=False):
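        # For every test-set group: compute ANI ensemble energies/forces, convert the
        # reference values to kcal/mol, and stack per-member plus ensemble-mean
        # predictions into self.tdata for later statistics.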
        self.tdata = dict()
        for key in self.tsfiles.keys():
            print('   -Working on', key, '...')

            cdata = dict({
                'Eani': [],
                'Edft': [],
                'Fani': [],
                'Fdft': [],
                'dEani': [],
                'dEdft': [],
                'Na': [],
                'Na2': [],
            })

            for file in self.tsfiles[key]:
                adl = ant.anidataloader(file)
                for i, data in enumerate(adl):
                    #if i > 5:
                    #    break
                    if data['coordinates'].shape[0] != 0:
                        Eani, Fani, sig = self.compute_energyandforce_conformations(
                            data['coordinates'],
                            data['species'],
                            ensemble=False)

                        midx = np.where(
                            data['energies'] - data['energies'].min() < maxe /
                            hdt.hatokcal)[0]

                        Eani = Eani[:, midx]
                        Edft = data['energies'][midx]
                        Fani = Fani[:, midx, :, :]
                        if forces:
                            if grad:
                                Fdft = -data['forces'][midx]
                            else:
                                Fdft = data['forces'][midx]
                        else:
                            Fdft = 0.0 * data['coordinates']

                        #Eestd = np.std(Eani, axis=0)/np.sqrt(len(data['species']))
                        Eeani = np.mean(Eani, axis=0).reshape(1, -1)
                        Feani = np.mean(Fani, axis=0).flatten().reshape(1, -1)

                        Fani = Fani.reshape(Fani.shape[0], -1)

                        Eani = np.vstack([Eani, Eeani])
                        Fani = np.vstack([Fani, Feani])

                        Edft = hdt.hatokcal * Edft
                        Fdft = hdt.hatokcal * Fdft.flatten()

                        cdata['Na'].append(
                            np.full(Edft.size,
                                    len(data['species']),
                                    dtype=np.int32))

                        cdata['Eani'].append(Eani)
                        cdata['Edft'].append(Edft)

                        cdata['Fani'].append(Fani)
                        cdata['Fdft'].append(Fdft)

                        #cdata['Frmse'].append(np.sqrt(np.mean((Fani-Fdft).reshape(Fdft.shape[0], -1)**2, axis=1)))
                        #cdata['Frmae'].append(np.sqrt(np.mean(np.abs((Fani - Fdft).reshape(Fdft.shape[0], -1)), axis=1)))

                        cdata['dEani'].append(
                            hdt.calculateKdmat(self.Nn + 1, Eani))
                        cdata['dEdft'].append(hdt.calculatedmat(Edft))

                        cdata['Na2'].append(
                            np.full(cdata['dEdft'][-1].size,
                                    len(data['species']),
                                    dtype=np.int32))

                        #cdata['Erani'].append(Eani-Eani.min())
                        #cdata['Erdft'].append(Edft-Edft.min())

            for k in ['Na', 'Na2', 'Edft', 'Fdft', 'dEdft']:
                cdata[k] = np.concatenate(cdata[k])

            for k in ['Eani', 'Fani', 'dEani']:
                cdata[k] = np.hstack(cdata[k])

            self.tdata.update({key: cdata})
Example #26
import pyanitools as pya

# Set the HDF5 file containing the data
hdf5file = '../ani_gdb_s01.h5'

# Construct the data loader class
adl = pya.anidataloader(hdf5file)

# Print the species of the data set one by one
for data in adl:

    # Extract the data
    P = data['path']
    X = data['coordinates']
    E = data['energies']
    S = data['species']
    sm = data['smiles']

    # Print the data
    print("Path:   ", P)
    print("  Smiles:      ", "".join(sm))
    print("  Symbols:     ", S)
    print("  Coordinates: ", X)
    print("  Energies:    ", E, "\n")

# Closes the H5 data file
adl.cleanup()
Example #27
import pyanitools as pyt
#import pyaniasetools as aat
import numpy as np
import hdnntools as hdt
import os

#import matplotlib.pyplot as plt

file_old = '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0707.0000.0408.h5'
file_new = '/home/jsmith48/scratch/auto_al/h5files_fix/ANI-AL-0707.0000.0408.h5'

print('Working on file:', file_old)
adl = pyt.anidataloader(file_old)

# Data storage
dpack = pyt.datapacker(file_new, mode='w')

for i, data in enumerate(adl):
    #if i == 20:
    #    break
    X = data['coordinates']
    S = data['species']
    Edft = data['energies']
    path = data['path']
    del data['path']

    #Eani, Fani = anicv.compute_energy_conformations(X=np.array(X,dtype=np.float32),S=S)

    Esae = hdt.compute_sae(
        '/home/jsmith48/scratch/auto_al/modelCNOSFCl/sae_wb97x-631gd.dat', S)