def _ao2mo_ovov(mp, orbs, feri, max_memory=2000, verbose=None):
    '''Transform AO two-electron integrals into occupied-virtual MO blocks.

    The work is done in two passes through a temporary HDF5 file.  Pass 1
    contracts two AO indices with occupied orbitals, one shell block at a
    time, and stores the half-transformed blocks under 'aa', 'bb' and 'ab'.
    Pass 2 contracts the remaining two AO indices with virtual orbitals and
    writes the datasets 'ovov', 'ovOV' and 'OVOV' into ``feri``.

    Args:
        mp : object exposing ``mol``; it is also passed to
            ``logger.new_logger``.
        orbs : sequence of four 2D coefficient arrays, unpacked in the order
            (orboa, orbva, orbob, orbvb).  ``orboa`` has shape (nao, nocca);
            the column counts of the others give nvira, noccb and nvirb.
            (Presumably the two spin channels -- confirm against the caller.)
        feri : object with an h5py-style ``create_dataset`` method.
        max_memory : memory budget in MB used to size the work buffers.
        verbose : forwarded to ``logger.new_logger``.

    Returns:
        None.  The transformed integrals are stored in ``feri``.
    '''
    # time.clock() no longer exists on Python >= 3.8; use the same clock
    # helpers that the rest of this file passes to log.timer().
    time0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mp, verbose)
    orboa = numpy.asarray(orbs[0], order='F')
    orbva = numpy.asarray(orbs[1], order='F')
    orbob = numpy.asarray(orbs[2], order='F')
    orbvb = numpy.asarray(orbs[3], order='F')
    nao, nocca = orboa.shape
    noccb = orbob.shape[1]
    nvira = orbva.shape[1]
    nvirb = orbvb.shape[1]

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nbas = mol.nbas
    assert (nvira <= nao)
    assert (nvirb <= nao)

    ao_loc = mol.ao_loc_nr()
    # Size of an AO shell block, bounded below by 4 and above by nao/3.
    dmax = max(
        4,
        min(nao / 3,
            numpy.sqrt(max_memory * .95e6 / 8 / (nao + nocca)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    # Replace the requested block size with the largest one actually produced.
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao))
    ftmp = lib.H5TmpFile()
    disk = (nocca**2 * (nao * (nao + dmax) / 2 + nvira**2) +
            noccb**2 * (nao * (nao + dmax) / 2 + nvirb**2) +
            nocca * noccb * (nao**2 + nvira * nvirb))
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax, disk * 8 / 1e6)

    fint = gto.moleintor.getints4c
    aa_blk_slices = []
    ab_blk_slices = []
    count_ab = 0
    count_aa = 0
    time1 = time0
    # ---- pass 1: contract the two outer AO indices with occupied orbitals
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ish0, ish1, ni in sh_ranges:
            for jsh0, jsh1, nj in sh_ranges:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                di = i1 - i0
                dj = j1 - j0

                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=(0, nbas, ish0, ish1,
                                       jsh0, jsh1, 0, nbas),
                           aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                tmp_i = lib.ddot(orboa.T, eri.reshape(nao, di * dj * nao))
                tmp_li = lib.ddot(orbob.T,
                                  tmp_i.reshape(nocca * di * dj, nao).T)
                tmp_li = tmp_li.reshape(noccb, nocca, di, dj)
                save('ab/%d' % count_ab, tmp_li.transpose(1, 0, 2, 3))
                ab_blk_slices.append((i0, i1, j0, j1))
                count_ab += 1

                # Same-spin blocks are stored only for ish0 >= jsh0; the
                # other half is rebuilt by transposition in load_aa below.
                if ish0 >= jsh0:
                    tmp_li = lib.ddot(orboa.T,
                                      tmp_i.reshape(nocca * di * dj, nao).T)
                    tmp_li = tmp_li.reshape(nocca, nocca, di, dj)
                    save('aa/%d' % count_aa, tmp_li.transpose(1, 0, 2, 3))

                    tmp_i = lib.ddot(orbob.T,
                                     eri.reshape(nao, di * dj * nao))
                    tmp_li = lib.ddot(orbob.T,
                                      tmp_i.reshape(noccb * di * dj, nao).T)
                    tmp_li = tmp_li.reshape(noccb, noccb, di, dj)
                    save('bb/%d' % count_aa, tmp_li.transpose(1, 0, 2, 3))
                    aa_blk_slices.append((i0, i1, j0, j1))
                    count_aa += 1

                time1 = log.timer_debug1(
                    'partial ao2mo [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    # Drop the references to the pass-1 work arrays.
    eri = eribuf = tmp_i = tmp_li = None

    fovov = feri.create_dataset('ovov', (nocca * nvira, nocca * nvira), 'f8',
                                chunks=(nvira, nvira))
    fovOV = feri.create_dataset('ovOV', (nocca * nvira, noccb * nvirb), 'f8',
                                chunks=(nvira, nvirb))
    fOVOV = feri.create_dataset('OVOV', (noccb * nvirb, noccb * nvirb), 'f8',
                                chunks=(nvirb, nvirb))
    # Number of occupied orbitals handled per pass-2 batch.
    occblk = int(
        min(max(nocca, noccb),
            max(4, 250 / nocca,
                max_memory * .9e6 / 8 / (nao**2 * nocca) / 5)))

    def load_aa(h5g, nocc, i0, eri):
        # Fill `eri` with the rows [i0, i0+occblk) of a same-spin group,
        # mirroring every off-diagonal AO block into its transposed slot.
        if i0 < nocc:
            i1 = min(i0 + occblk, nocc)
            for k, (p0, p1, q0, q1) in enumerate(aa_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = h5g[str(k)][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(h5g[str(k)][:, i0:i1])
                    eri[:i1 - i0, :, q0:q1, p0:p1] = dat.transpose(1, 0, 3, 2)

    def load_ab(h5g, nocca, i0, eri):
        # Mixed-spin blocks were stored for every (ish, jsh) pair, so no
        # mirroring is needed.
        if i0 < nocca:
            i1 = min(i0 + occblk, nocca)
            for k, (p0, p1, q0, q1) in enumerate(ab_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = h5g[str(k)][i0:i1]

    def save(h5dat, nvir, i0, i1, dat):
        # Write one (nvir, -1) slab per occupied index into the dataset.
        for i in range(i0, i1):
            h5dat[i * nvir:(i + 1) * nvir] = dat[i - i0].reshape(nvir, -1)

    # ---- pass 2: contract the inner AO indices with virtual orbitals
    with lib.call_in_background(save) as bsave:
        with lib.call_in_background(load_aa) as prefetch:
            buf_prefetch = numpy.empty((occblk, nocca, nao, nao))
            buf = numpy.empty_like(buf_prefetch)
            load_aa(ftmp['aa'], nocca, 0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocca, occblk):
                # Swap: work on the batch just loaded, prefetch the next.
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(ftmp['aa'], nocca, i1, buf_prefetch)
                eri = buf[:i1 - i0].reshape((i1 - i0) * nocca, nao, nao)
                dat = _ao2mo.nr_e2(eri, orbva, (0, nvira, 0, nvira),
                                   's1', 's1')
                bsave(fovov, nvira, i0, i1,
                      dat.reshape(i1 - i0, nocca, nvira,
                                  nvira).transpose(0, 2, 1, 3))
                time1 = log.timer_debug1(
                    'pass2 ao2mo for aa [%d:%d]' % (i0, i1), *time1)

            buf_prefetch = numpy.empty((occblk, noccb, nao, nao))
            buf = numpy.empty_like(buf_prefetch)
            load_aa(ftmp['bb'], noccb, 0, buf_prefetch)
            for i0, i1 in lib.prange(0, noccb, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(ftmp['bb'], noccb, i1, buf_prefetch)
                eri = buf[:i1 - i0].reshape((i1 - i0) * noccb, nao, nao)
                dat = _ao2mo.nr_e2(eri, orbvb, (0, nvirb, 0, nvirb),
                                   's1', 's1')
                bsave(fOVOV, nvirb, i0, i1,
                      dat.reshape(i1 - i0, noccb, nvirb,
                                  nvirb).transpose(0, 2, 1, 3))
                time1 = log.timer_debug1(
                    'pass2 ao2mo for bb [%d:%d]' % (i0, i1), *time1)

        # The (occblk, noccb, nao, nao) buffers from the 'bb' stage are
        # reused for the mixed block.
        orbvab = numpy.asarray(numpy.hstack((orbva, orbvb)), order='F')
        with lib.call_in_background(load_ab) as prefetch:
            load_ab(ftmp['ab'], nocca, 0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocca, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(ftmp['ab'], nocca, i1, buf_prefetch)
                eri = buf[:i1 - i0].reshape((i1 - i0) * noccb, nao, nao)
                dat = _ao2mo.nr_e2(eri, orbvab,
                                   (0, nvira, nvira, nvira + nvirb),
                                   's1', 's1')
                bsave(fovOV, nvira, i0, i1,
                      dat.reshape(i1 - i0, noccb, nvira,
                                  nvirb).transpose(0, 2, 1, 3))
                time1 = log.timer_debug1(
                    'pass2 ao2mo for ab [%d:%d]' % (i0, i1), *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
def _ao2mo_ovov(mp, orbo, orbv, feri, max_memory=2000, verbose=None):
    '''Transform AO two-electron integrals into the 'ovov' MO block.

    Pass 1 contracts two AO indices with ``orbo`` for every lower-triangular
    pair of AO shell blocks and stores the result in a temporary HDF5 file.
    Pass 2 contracts the remaining AO indices with ``orbv`` and writes the
    result into the 'ovov' dataset created on ``feri``.

    Args:
        mp : object exposing ``mol``; it is also passed to
            ``logger.new_logger``.
        orbo : 2D array of shape (nao, nocc).
        orbv : 2D array with nvir columns (converted to Fortran order here).
        feri : object with an h5py-style ``create_dataset`` method.
        max_memory : memory budget in MB used to size the work buffers.
        verbose : forwarded to ``logger.new_logger``.

    Returns:
        The 'ovov' dataset, of shape (nocc*nvir, nocc*nvir).
    '''
    # time.clock() no longer exists on Python >= 3.8; use the same clock
    # helpers that the rest of this file passes to log.timer().
    time0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mp, verbose)

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nao, nocc = orbo.shape
    nvir = orbv.shape[1]
    nbas = mol.nbas
    assert (nvir <= nao)

    ao_loc = mol.ao_loc_nr()
    # Size of an AO shell block, bounded below by 4 and above by nao/3.
    dmax = max(
        4,
        min(nao / 3,
            numpy.sqrt(max_memory * .95e6 / 8 / (nao + nocc)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    # Replace the requested block size with the largest one actually produced.
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax,
              nocc**2 * (nao * (nao + dmax) / 2 + nvir**2) * 8 / 1e6)

    buf_i = numpy.empty((nocc * dmax**2 * nao))
    # Two output buffers are alternated so the background writer can still
    # read one while the next block is computed into the other.
    buf_li = numpy.empty((nocc**2 * dmax**2))
    buf1 = numpy.empty_like(buf_li)

    fint = gto.moleintor.getints4c
    jk_blk_slices = []
    count = 0
    time1 = time0
    # ---- pass 1: contract the two outer AO indices with occupied orbitals
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            for jsh0, jsh1, nj in sh_ranges[:ip + 1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                di = i1 - i0
                dj = j1 - j0
                jk_blk_slices.append((i0, i1, j0, j1))

                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=(0, nbas, ish0, ish1,
                                       jsh0, jsh1, 0, nbas),
                           aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                tmp_i = numpy.ndarray((nocc, di * dj * nao), buffer=buf_i)
                tmp_li = numpy.ndarray((nocc, nocc * di * dj), buffer=buf_li)
                lib.ddot(orbo.T, eri.reshape(nao, di * dj * nao), c=tmp_i)
                lib.ddot(orbo.T, tmp_i.reshape(nocc * di * dj, nao).T,
                         c=tmp_li)
                tmp_li = tmp_li.reshape(nocc, nocc, di, dj)
                save(str(count), tmp_li.transpose(1, 0, 2, 3))
                buf_li, buf1 = buf1, buf_li
                count += 1
                time1 = log.timer_debug1(
                    'partial ao2mo [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    # Drop the references to the pass-1 work arrays.
    eri = eribuf = tmp_i = tmp_li = buf_i = buf_li = buf1 = None

    h5dat = feri.create_dataset('ovov', (nocc * nvir, nocc * nvir), 'f8',
                                chunks=(nvir, nvir))
    # Number of occupied orbitals handled per pass-2 batch.
    occblk = int(
        min(nocc,
            max(4, 250 / nocc, max_memory * .9e6 / 8 / (nao**2 * nocc) / 5)))

    def load(i0, eri):
        # Fill `eri` with the rows [i0, i0+occblk); only the lower triangle
        # of AO blocks was stored, so mirror each off-diagonal block.
        if i0 < nocc:
            i1 = min(i0 + occblk, nocc)
            for k, (p0, p1, q0, q1) in enumerate(jk_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = ftmp[str(k)][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(ftmp[str(k)][:, i0:i1])
                    eri[:i1 - i0, :, q0:q1, p0:p1] = dat.transpose(1, 0, 3, 2)

    def save(i0, i1, dat):
        # Write one (nvir, nocc*nvir) slab per occupied index.
        for i in range(i0, i1):
            h5dat[i * nvir:(i + 1) * nvir] = dat[i - i0].reshape(
                nvir, nocc * nvir)

    orbv = numpy.asarray(orbv, order='F')
    buf_prefetch = numpy.empty((occblk, nocc, nao, nao))
    buf = numpy.empty_like(buf_prefetch)
    bufw = numpy.empty((occblk * nocc, nvir**2))
    bufw1 = numpy.empty_like(bufw)
    # ---- pass 2: contract the inner AO indices with virtual orbitals
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            load(0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocc, occblk):
                # Swap: work on the batch just loaded, prefetch the next.
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(i1, buf_prefetch)
                eri = buf[:i1 - i0].reshape((i1 - i0) * nocc, nao, nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0, nvir, 0, nvir),
                                   's1', 's1', out=bufw)
                bsave(i0, i1,
                      dat.reshape(i1 - i0, nocc, nvir,
                                  nvir).transpose(0, 2, 1, 3))
                # Alternate the output buffer handed to the writer.
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0, i1),
                                         *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    return h5dat
def half_e1(mol, mo_coeffs, swapfile,
            intor='int2e', aosym='s4', comp=1,
            max_memory=MAX_MEMORY, ioblk_size=IOBLK_SIZE, verbose=logger.WARN,
            compact=True, ao2mopt=None):
    r'''Half transform arbitrary spherical AO integrals to MO integrals
    for the given two sets of orbitals

    Args:
        mol : :class:`Mole` object
            AO integrals will be generated in terms of mol._atm, mol._bas, mol._env
        mo_coeffs : sequence of ndarray
            The first two entries are the orbital sets used to transform
            the (ij| pair of (ij|kl).  Complex coefficients are rejected.
        swapfile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.  The transformed
            integrals are saved in blocks.

    Kwargs
        intor : str
            Name of the 2-electron integral.  Ref to :func:`getints_by_shell`
            for the complete list of available 2-electron integral names
        aosym : int or str
            Permutation symmetry for the AO integrals

            | 4 or '4' or 's4': 4-fold symmetry (default)
            | '2ij' or 's2ij' : symmetry between i, j in (ij|kl)
            | '2kl' or 's2kl' : symmetry between k, l in (ij|kl)
            | 1 or '1' or 's1': no symmetry
            | 'a4ij' : 4-fold symmetry with anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a4kl' : 4-fold symmetry with anti-symmetry between k, l in (ij|kl) (TODO)
            | 'a2ij' : anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a2kl' : anti-symmetry between k, l in (ij|kl) (TODO)

        comp : int
            Components of the integrals, e.g. int2e_ip_sph has 3 components.
        max_memory : float or int
            The maximum size of cache to use (in MB), large cache may **not**
            improve performance.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        verbose : int
            Print level
        compact : bool
            When compact is True, depending on the four orbital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals
        ao2mopt : :class:`AO2MOpt` object
            Precomputed data to improve performance

    Returns:
        The ``swapfile`` argument, unchanged.  The half-transformed integrals
        are stored in it under the keys '<icomp>/<istep>'.

    Raises:
        NotImplementedError : if any array in ``mo_coeffs`` has dtype
            complex128.
    '''
    if any(c.dtype == numpy.complex128 for c in mo_coeffs):
        raise NotImplementedError('Integral transformation for complex orbitals')

    intor = mol._add_suffix(intor)
    time0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mol, verbose)

    nao = mo_coeffs[0].shape[0]
    aosym = _stand_sym_code(aosym)
    # Packed (triangular) pair count when the AO pair kl is symmetric in
    # storage, full square otherwise.
    if aosym in ('s4', 's2ij'):
        nao_pair = nao * (nao+1) // 2
    else:
        nao_pair = nao * nao

    ijmosym, nij_pair, moij, ijshape = \
            incore._conc_mos(mo_coeffs[0], mo_coeffs[1],
                             compact and aosym in ('s4', 's2ij'))

    e1buflen, mem_words, iobuf_words, ioblk_words = \
            guess_e1bufsize(max_memory, ioblk_size, nij_pair, nao_pair, comp)
    # Re-express the IO block size in MB from the word count (8 bytes each).
    ioblk_size = ioblk_words * 8/1e6
# The buffer to hold AO integrals in C code, see line (@)
    aobuflen = max(int((mem_words - 2*comp*e1buflen*nij_pair) // (nao_pair*comp)),
                   IOBUF_ROW_MIN)
    ao_loc = mol.ao_loc_nr('_cart' in intor)
    shranges = guess_shell_ranges(mol, (aosym in ('s4', 's2kl')), e1buflen,
                                  aobuflen, ao_loc)
    if ao2mopt is None:
        # The screening callbacks are only attached for the plain int2e
        # integrals; every other integral gets a bare optimizer.
        if intor == 'int2e_cart' or intor == 'int2e_sph':
            ao2mopt = _ao2mo.AO2MOpt(mol, intor, 'CVHFnr_schwarz_cond',
                                     'CVHFsetnr_direct_scf')
        else:
            ao2mopt = _ao2mo.AO2MOpt(mol, intor)

    # An open h5py group is written to directly; anything else is handed to
    # lib.H5TmpFile.
    if isinstance(swapfile, h5py.Group):
        fswap = swapfile
    else:
        fswap = lib.H5TmpFile(swapfile)
    for icomp in range(comp):
        fswap.create_group(str(icomp)) # for h5py old version

    log.debug('step1: tmpfile %s %.8g MB', fswap.filename, nij_pair*nao_pair*8/1e6)
    log.debug('step1: (ij,kl) = (%d,%d), mem cache %.8g MB, iobuf %.8g MB',
              nij_pair, nao_pair, mem_words*8/1e6, iobuf_words*8/1e6)
    nstep = len(shranges)
    # From here on e1buflen is the largest row count of any shell range.
    e1buflen = max([x[2] for x in shranges])

    e2buflen, chunks = guess_e2bufsize(ioblk_size, nij_pair, e1buflen)
    def save(istep, iobuf):
        # Runs on the background thread: one dataset per component and step.
        for icomp in range(comp):
            _transpose_to_h5g(fswap, '%d/%d'%(icomp,istep), iobuf[icomp],
                              e2buflen, None)

    # transform e1
    ti0 = log.timer('Initializing ao2mo.outcore.half_e1', *time0)
    with lib.call_in_background(save) as async_write:
        buf1 = numpy.empty((comp*e1buflen,nao_pair))
        # buf2 / buf_write are swapped after every step so the next step is
        # not computed into the buffer just handed to async_write.
        buf2 = numpy.empty((comp*e1buflen,nij_pair))
        buf_write = numpy.empty_like(buf2)
        fill = _ao2mo.nr_e1fill
        f_e1 = _ao2mo.nr_e1
        for istep,sh_range in enumerate(shranges):
            log.debug1('step 1 [%d/%d], AO [%d:%d], len(buf) = %d',
                       istep+1, nstep, *(sh_range[:3]))
            buflen = sh_range[2]
            iobuf = numpy.ndarray((comp,buflen,nij_pair), buffer=buf2)
            nmic = len(sh_range[3])
            p1 = 0
            # sh_range[3] lists the micro shell ranges of this step; each
            # fills rows [p0:p1) of iobuf.
            for imic, aoshs in enumerate(sh_range[3]):
                log.debug2(' fill iobuf micro [%d/%d], AO [%d:%d], len(aobuf) = %d',
                           imic+1, nmic, *aoshs)
                buf = fill(intor, aoshs, mol._atm, mol._bas, mol._env, aosym, comp,
                           ao2mopt, out=buf1).reshape(-1,nao_pair)
                buf = f_e1(buf, moij, ijshape, aosym, ijmosym)
                p0, p1 = p1, p1 + aoshs[2]
                iobuf[:,p0:p1] = buf.reshape(comp,aoshs[2],nij_pair)
            ti0 = log.timer_debug1('gen AO/transform MO [%d/%d]'%(istep+1,nstep), *ti0)

            async_write(istep, iobuf)
            buf2, buf_write = buf_write, buf2

    # Only the local reference is dropped; the file or group is not closed here.
    fswap = None
    return swapfile
def _make_ao_ints(mol, mo_coeff, nocc, dtype):
    '''Build the half-transformed integral tensors ppoo, ppov and ppvv.

    For every lower-triangular pair of AO shell blocks assigned to this
    process, the AO integrals are generated, the last two indices are
    transformed with ``mo_coeff``, and the occupied-occupied,
    occupied-virtual and (packed) virtual-virtual slices are written into
    three ctf tensors.

    Args:
        mol : molecule object providing ``ao_loc_nr``, ``_add_suffix`` and
            the ``_atm`` / ``_bas`` / ``_env`` arrays.
        mo_coeff : 2D array of shape (nao, nmo).
        nocc : number of occupied orbitals; the remaining nmo - nocc columns
            are treated as virtual.
        dtype : dtype of the ctf tensors.

    Returns:
        Tuple (ppoo, ppov, ppvv) of ctf tensors with leading dimensions
        (nao, nao).
    '''
    NS = ctf.SYM.NS
    SY = ctf.SYM.SY

    ao_loc = mol.ao_loc_nr()
    mo = np.asarray(mo_coeff, order='F')
    nao, nmo = mo.shape
    nvir = nmo - nocc

    ppoo = ctf.tensor((nao, nao, nocc, nocc), sym=[SY, NS, NS, NS],
                      dtype=dtype)
    ppov = ctf.tensor((nao, nao, nocc, nvir), sym=[SY, NS, NS, NS],
                      dtype=dtype)
    ppvv = ctf.tensor((nao, nao, nvir, nvir), sym=[SY, NS, SY, NS],
                      dtype=dtype)
    intor = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, intor, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    blksize = int(max(4, min(nao / 3, nao / size**.5, 2000e6 / 8 / nao**3)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, blksize)
    # Lower-triangular pairs of shell blocks (the diagonal is included).
    tasks = [(ish0, ish1, jsh0, jsh1)
             for k, (ish0, ish1, _) in enumerate(sh_ranges)
             for jsh0, jsh1, _ in sh_ranges[:k + 1]]

    # Flat-index lookup tables: sqidx maps an AO pair to its position in the
    # leading (nao, nao) dimensions, vtrilidx lists the flat positions of
    # the lower triangle of a (nvir, nvir) matrix.
    sqidx = np.arange(nao**2).reshape(nao, nao)
    vsqidx = np.arange(nvir**2).reshape(nvir, nvir)
    vtrilidx = vsqidx[np.tril_indices(nvir)]
    # Per-row offsets are the same for every task, so build them once.
    oo_offsets = np.arange(nocc**2)
    ov_offsets = np.arange(nocc * nvir)

    subtasks = list(static_partition(tasks))
    ntasks = max(comm.allgather(len(subtasks)))
    for itask in range(ntasks):
        if itask >= len(subtasks):
            # This process has run out of work but still issues empty writes
            # (presumably because write() must be called by every process in
            # step -- confirm against the ctf documentation).
            ppoo.write([], [])
            ppov.write([], [])
            ppvv.write([], [])
            continue

        shls_slice = subtasks[itask]
        ish0, ish1, jsh0, jsh1 = shls_slice
        i0, i1 = ao_loc[ish0], ao_loc[ish1]
        j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
        di = i1 - i0
        dj = j1 - j0
        if i0 != j0:
            # Off-diagonal block: every (i, j) AO pair is present.
            eri = gto.moleintor.getints4c(intor, mol._atm, mol._bas, mol._env,
                                          shls_slice=shls_slice, aosym='s2kl',
                                          ao_loc=ao_loc,
                                          cintopt=ao2mopt._cintopt)
            idx = sqidx[i0:i1, j0:j1].ravel()
            eri = _ao2mo.nr_e2(eri.reshape(di * dj, -1), mo, (0, nmo, 0, nmo),
                               's2kl', 's1')
        else:
            # Diagonal block: only the lower triangle of (i, j) is generated.
            eri = gto.moleintor.getints4c(intor, mol._atm, mol._bas, mol._env,
                                          shls_slice=shls_slice, aosym='s4',
                                          ao_loc=ao_loc,
                                          cintopt=ao2mopt._cintopt)
            eri = _ao2mo.nr_e2(eri, mo, (0, nmo, 0, nmo), 's4', 's1')
            idx = sqidx[i0:i1, j0:j1][np.tril_indices(i1 - i0)]

        ooidx = idx[:, None] * nocc**2 + oo_offsets
        ovidx = idx[:, None] * (nocc * nvir) + ov_offsets
        vvidx = idx[:, None] * nvir**2 + vtrilidx
        eri = eri.reshape(-1, nmo, nmo)
        ppoo.write(ooidx.ravel(), eri[:, :nocc, :nocc].ravel())
        ppov.write(ovidx.ravel(), eri[:, :nocc, nocc:].ravel())
        ppvv.write(vvidx.ravel(),
                   pyscflib.pack_tril(eri[:, nocc:, nocc:]).ravel())
        idx = eri = None
    return ppoo, ppov, ppvv
def trans_e1_outcore(mol, mo, ncore, ncas, erifile, max_memory=None,
                     level=1, verbose=logger.WARN):
    '''Out-of-core integral transformation producing 'papa' and 'ppaa'.

    AO integrals are generated shell range by shell range and half
    transformed.  The datasets 'papa' (nmo, ncas, nmo, ncas) and 'ppaa'
    (nmo, nmo, ncas, ncas) are written to the HDF5 file ``erifile``.  When
    ``level == 1`` the matrices ``j_pc`` and ``k_pc`` are accumulated as
    well; for any other level they are returned as zeros.

    Args:
        mol : molecule object providing ``stdout``, ``ao_loc_nr`` and the
            ``_atm`` / ``_bas`` / ``_env`` arrays.
        mo : 2D array of shape (nao, nmo).
        ncore : number of leading columns of ``mo`` used for j_pc / k_pc.
        ncas : number of columns following the core ones (the "a" index).
        erifile : path of the HDF5 file to create (opened with mode 'w').
        max_memory : memory budget in MB.  It is used in arithmetic, so a
            number must be supplied even though the default is None.
        level : see above.
        verbose : a ``logger.Logger`` instance or a print level.

    Returns:
        Tuple (j_pc, k_pc), each of shape (nmo, ncore).
    '''
    # time.clock() no longer exists on Python >= 3.8; use the same clock
    # helpers that the rest of this file passes to log.timer().
    time0 = (logger.process_clock(), logger.perf_counter())
    if isinstance(verbose, logger.Logger):
        log = verbose
    else:
        log = logger.Logger(mol.stdout, verbose)
    log.debug1('trans_e1_outcore level %d max_memory %d', level, max_memory)
    nao, nmo = mo.shape
    nao_pair = nao * (nao + 1) // 2
    nocc = ncore + ncas

    _tmpfile1 = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
    # h5py >= 3 opens read-only when no mode is given; the scratch file is
    # written to below, so request write mode explicitly.
    faapp_buf = h5py.File(_tmpfile1.name, 'w')
    feri = h5py.File(erifile, 'w')

    mo_c = numpy.asarray(mo, order='C')
    mo = numpy.asarray(mo, order='F')
    pashape = (0, nmo, ncore, nocc)
    papa_buf = numpy.zeros((nao, ncas, nmo * ncas))
    j_pc = numpy.zeros((nmo, ncore))
    k_pc = numpy.zeros((nmo, ncore))

    mem_words = int(max(2000, max_memory - papa_buf.nbytes / 1e6) * 1e6 / 8)
    aobuflen = mem_words // (nao_pair + nocc * nmo) + 1
    ao_loc = numpy.array(mol.ao_loc_nr(), dtype=numpy.int32)
    shranges = outcore.guess_shell_ranges(mol, True, aobuflen, None, ao_loc)
    intor = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, intor, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nstep = len(shranges)
    maxbuflen = max([x[2] for x in shranges])
    log.debug('mem_words %.8g MB, maxbuflen = %d',
              mem_words * 8 / 1e6, maxbuflen)
    bufs1 = numpy.empty((maxbuflen, nao_pair))
    bufs2 = numpy.empty((maxbuflen, nmo * ncas))
    if level == 1:
        bufs3 = numpy.empty((maxbuflen, nao * ncore))
        log.debug('mem cache %.8g MB',
                  (bufs1.nbytes + bufs2.nbytes + bufs3.nbytes) / 1e6)
    else:
        log.debug('mem cache %.8g MB', (bufs1.nbytes + bufs2.nbytes) / 1e6)
    ti0 = log.timer('Initializing trans_e1_outcore', *time0)

    # fmmm, ftrans, fdrv for level 1
    fmmm = libmcscf.AO2MOmmm_ket_nr_s2
    ftrans = libmcscf.AO2MOtranse1_nr_s4
    fdrv = libmcscf.AO2MOnr_e2_drv
    for istep, sh_range in enumerate(shranges):
        log.debug('[%d/%d], AO [%d:%d], len(buf) = %d',
                  istep + 1, nstep, *sh_range)
        buf = bufs1[:sh_range[2]]
        _ao2mo.nr_e1fill(intor, sh_range, mol._atm, mol._bas, mol._env,
                         's4', 1, ao2mopt, buf)
        if log.verbose >= logger.DEBUG1:
            ti1 = log.timer('AO integrals buffer', *ti0)
        bufpa = bufs2[:sh_range[2]]
        _ao2mo.nr_e1(buf, mo, pashape, 's4', 's1', out=bufpa)

        # jc_pp, kc_pp
        if level == 1:  # ppaa, papa and vhf, jcp, kcp
            if log.verbose >= logger.DEBUG1:
                ti1 = log.timer('buffer-pa', *ti1)
            buf1 = bufs3[:sh_range[2]]
            fdrv(ftrans, fmmm,
                 buf1.ctypes.data_as(ctypes.c_void_p),
                 buf.ctypes.data_as(ctypes.c_void_p),
                 mo.ctypes.data_as(ctypes.c_void_p),
                 ctypes.c_int(sh_range[2]), ctypes.c_int(nao),
                 (ctypes.c_int * 4)(0, nao, 0, ncore),
                 ctypes.POINTER(ctypes.c_void_p)(), ctypes.c_int(0))
            p0 = 0
            for ij in range(sh_range[0], sh_range[1]):
                i, j = _ao2mo._extract_pair(ij)
                i0 = ao_loc[i]
                j0 = ao_loc[j]
                i1 = ao_loc[i + 1]
                j1 = ao_loc[j + 1]
                di = i1 - i0
                dj = j1 - j0
                if i == j:
                    # Diagonal shell pair: rows are stored as a packed lower
                    # triangle; expand it to a full symmetric (di, di) block.
                    dij = di * (di + 1) // 2
                    buf = numpy.empty((di, di, nao * ncore))
                    idx = numpy.tril_indices(di)
                    buf[idx] = buf1[p0:p0 + dij]
                    buf[idx[1], idx[0]] = buf1[p0:p0 + dij]
                    buf = buf.reshape(di, di, nao, ncore)
                    mo1 = mo_c[i0:i1]
                    tmp = numpy.einsum('uvpc,pc->uvc', buf, mo[:, :ncore])
                    tmp = lib.dot(mo1.T, tmp.reshape(di, -1))
                    j_pc += numpy.einsum('vp,pvc->pc', mo1,
                                         tmp.reshape(nmo, di, ncore))
                    tmp = numpy.einsum('uvpc,uc->vcp', buf, mo1[:, :ncore])
                    tmp = lib.dot(tmp.reshape(-1, nmo),
                                  mo).reshape(di, ncore, nmo)
                    k_pc += numpy.einsum('vp,vcp->pc', mo1, tmp)
                else:
                    # Off-diagonal shell pair: the (i, j) and (j, i)
                    # contributions are both added here.
                    dij = di * dj
                    buf = buf1[p0:p0 + dij].reshape(di, dj, nao, ncore)
                    mo1 = mo_c[i0:i1]
                    mo2 = mo_c[j0:j1]
                    tmp = numpy.einsum('uvpc,pc->uvc', buf, mo[:, :ncore])
                    tmp = lib.dot(mo1.T, tmp.reshape(di, -1))
                    j_pc += numpy.einsum('vp,pvc->pc', mo2,
                                         tmp.reshape(nmo, dj, ncore)) * 2
                    tmp = numpy.einsum('uvpc,uc->vcp', buf, mo1[:, :ncore])
                    tmp = lib.dot(tmp.reshape(-1, nmo),
                                  mo).reshape(dj, ncore, nmo)
                    k_pc += numpy.einsum('vp,vcp->pc', mo2, tmp)
                    tmp = numpy.einsum('uvpc,vc->ucp', buf, mo2[:, :ncore])
                    tmp = lib.dot(tmp.reshape(-1, nmo),
                                  mo).reshape(di, ncore, nmo)
                    k_pc += numpy.einsum('up,ucp->pc', mo1, tmp)
                p0 += dij
            if log.verbose >= logger.DEBUG1:
                ti1 = log.timer('j_cp and k_cp', *ti1)

        if log.verbose >= logger.DEBUG1:
            ti1 = log.timer('half transformation of the buffer', *ti1)

        # ppaa, papa
        faapp_buf[str(istep)] = \
            bufpa.reshape(sh_range[2], nmo, ncas)[:, ncore:nocc].reshape(-1, ncas**2).T
        p0 = 0
        for ij in range(sh_range[0], sh_range[1]):
            i, j = _ao2mo._extract_pair(ij)
            i0 = ao_loc[i]
            j0 = ao_loc[j]
            i1 = ao_loc[i + 1]
            j1 = ao_loc[j + 1]
            di = i1 - i0
            dj = j1 - j0
            if i == j:
                dij = di * (di + 1) // 2
                buf1 = numpy.empty((di, di, nmo * ncas))
                idx = numpy.tril_indices(di)
                buf1[idx] = bufpa[p0:p0 + dij]
                buf1[idx[1], idx[0]] = bufpa[p0:p0 + dij]
            else:
                dij = di * dj
                buf1 = bufpa[p0:p0 + dij].reshape(di, dj, -1)
                mo1 = mo[j0:j1, ncore:nocc].copy()
                # Accumulate in place into papa_buf (the two trailing 1's
                # are passed as lib.dot's scaling arguments).
                for u in range(di):
                    lib.dot(mo1.T, buf1[u], 1, papa_buf[i0 + u], 1)
            mo1 = mo[i0:i1, ncore:nocc].copy()
            buf1 = lib.dot(mo1.T, buf1.reshape(di, -1))
            papa_buf[j0:j1] += buf1.reshape(ncas, dj, -1).transpose(1, 0, 2)
            p0 += dij
        if log.verbose >= logger.DEBUG1:
            ti1 = log.timer('ppaa and papa buffer', *ti1)

        ti0 = log.timer('gen AO/transform MO [%d/%d]' % (istep + 1, nstep),
                        *ti0)
    # Drop the references to the pass-1 work arrays.
    buf = buf1 = bufpa = None
    bufs1 = bufs2 = bufs3 = None
    time1 = log.timer('mc_ao2mo pass 1', *time0)

    log.debug1('Half transformation done. Current memory %d',
               lib.current_memory()[0])

    nblk = int(
        max(8,
            min(nmo,
                (max_memory * 1e6 / 8 - papa_buf.size) / (ncas**2 * nmo))))
    log.debug1('nblk for papa = %d', nblk)
    dset = feri.create_dataset('papa', (nmo, ncas, nmo, ncas), 'f8')
    for i0, i1 in prange(0, nmo, nblk):
        tmp = lib.dot(mo[:, i0:i1].T, papa_buf.reshape(nao, -1))
        dset[i0:i1] = tmp.reshape(i1 - i0, ncas, nmo, ncas)
    papa_buf = tmp = None
    time1 = log.timer('papa pass 2', *time1)

    # Reassemble the per-step scratch datasets into one (ncas**2, nao_pair)
    # array before the second half transformation.
    tmp = numpy.empty((ncas**2, nao_pair))
    p0 = 0
    for istep, sh_range in enumerate(shranges):
        tmp[:, p0:p0 + sh_range[2]] = faapp_buf[str(istep)]
        p0 += sh_range[2]
    nblk = int(
        max(8,
            min(nmo,
                (max_memory * 1e6 / 8 - tmp.size) / (ncas**2 * nmo) - 1)))
    log.debug1('nblk for ppaa = %d', nblk)
    dset = feri.create_dataset('ppaa', (nmo, nmo, ncas, ncas), 'f8')
    for i0, i1 in prange(0, nmo, nblk):
        tmp1 = _ao2mo.nr_e2(tmp, mo, (i0, i1, 0, nmo), 's4', 's1',
                            ao_loc=ao_loc)
        tmp1 = tmp1.reshape(ncas, ncas, i1 - i0, nmo)
        for j in range(i1 - i0):
            dset[i0 + j] = tmp1[:, :, j].transpose(2, 0, 1)
    tmp = tmp1 = None
    time1 = log.timer('ppaa pass 2', *time1)

    faapp_buf.close()
    feri.close()
    # Dropping the last reference lets the NamedTemporaryFile clean up.
    _tmpfile1 = None
    time0 = log.timer('mc_ao2mo', *time0)
    return j_pc, k_pc
def half_e1(mol, mo_coeffs, swapfile,
            intor='int2e_spinor', aosym='s4', comp=None,
            max_memory=MAX_MEMORY, ioblk_size=IOBLK_SIZE, verbose=logger.WARN,
            ao2mopt=None):
    '''Half transform spinor AO integrals and store them in ``swapfile``.

    For every shell range the (ij| pair of the AO integrals is transformed
    with ``mo_coeffs[0]`` and ``mo_coeffs[1]``.  The complex result is
    written, transposed, to the dataset '<icomp>/<istep>' of a newly created
    HDF5 file.

    Args:
        mol : molecule object providing ``ao_loc_2c``, ``tmap`` and the
            ``_atm`` / ``_bas`` / ``_env`` arrays.
        mo_coeffs : sequence whose first two entries are 2D coefficient
            arrays with nao rows.
        swapfile : path of the HDF5 file to create (opened with mode 'w').
        intor : name of the integral, passed to ``_ao2mo.r_e1``.
        aosym : permutation-symmetry code, normalised by
            ``outcore._stand_sym_code``.
        comp : number of integral components.  It is used in arithmetic and
            in ``range(comp)``, so an integer must be supplied even though
            the default is None.
        max_memory, ioblk_size : forwarded to ``guess_e1bufsize`` /
            ``guess_e2bufsize`` to size the buffers.
        verbose : forwarded to ``logger.new_logger``.
        ao2mopt : optional precomputed ``AO2MOpt`` object.

    Returns:
        ``swapfile``, unchanged.
    '''
    # time.clock() no longer exists on Python >= 3.8; use the same clock
    # helpers that the rest of this file passes to log.timer().
    time0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mol, verbose)

    ijsame = iden_coeffs(mo_coeffs[0], mo_coeffs[1])

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nao = mo_coeffs[0].shape[0]
    aosym = outcore._stand_sym_code(aosym)
    if aosym in ('s1', 's2kl', 'a2kl'):
        nao_pair = nao * nao
    else:
        nao_pair = _count_naopair(mol, nao)
    nij_pair = nmoi * nmoj

    if ijsame and aosym in ('s4', 's2ij', 'a2ij', 'a4ij', 'a4kl', 'a4'):
        log.debug('i-mo == j-mo')
        moij = numpy.asarray(mo_coeffs[0], order='F')
        ijshape = (0, nmoi, 0, nmoi)
    else:
        # Concatenate both orbital sets; ijshape selects the i and j column
        # ranges inside the combined array.
        moij = numpy.asarray(numpy.hstack((mo_coeffs[0], mo_coeffs[1])),
                             order='F')
        ijshape = (0, nmoi, nmoi, nmoi + nmoj)

    e1buflen, mem_words, iobuf_words, ioblk_words = \
        guess_e1bufsize(max_memory, ioblk_size, nij_pair, nao_pair, comp)
    # The buffer to hold AO integrals in C code
    aobuflen = int((mem_words - iobuf_words) // (nao * nao * comp))
    shranges = outcore.guess_shell_ranges(
        mol, (aosym not in ('s1', 's2ij', 'a2ij')),
        aobuflen, e1buflen, mol.ao_loc_2c(), False)
    if ao2mopt is None:
        # No screening callbacks are attached for spinor integrals.
        ao2mopt = _ao2mo.AO2MOpt(mol, intor)

    # Sizes are reported with 16 bytes per element (complex128).
    log.debug('step1: tmpfile %.8g MB', nij_pair*nao_pair*16/1e6)
    log.debug('step1: (ij,kl) = (%d,%d), mem cache %.8g MB, iobuf %.8g MB',
              nij_pair, nao_pair, mem_words*16/1e6, iobuf_words*16/1e6)

    # The context manager guarantees the swap file is closed even when the
    # transformation raises part-way through.
    with h5py.File(swapfile, 'w') as fswap:
        for icomp in range(comp):
            fswap.create_group(str(icomp))  # for h5py old version

        tao = numpy.asarray(mol.tmap(), dtype=numpy.int32)

        # transform e1
        ti0 = log.timer('Initializing ao2mo.outcore.half_e1', *time0)
        nstep = len(shranges)
        for istep, sh_range in enumerate(shranges):
            log.debug('step 1 [%d/%d], AO [%d:%d], len(buf) = %d',
                      istep+1, nstep, *(sh_range[:3]))
            buflen = sh_range[2]
            # numpy.complex was removed in NumPy 1.24; complex128 is the
            # dtype it used to alias.
            iobuf = numpy.empty((comp, buflen, nij_pair),
                                dtype=numpy.complex128)
            nmic = len(sh_range[3])
            p0 = 0
            # sh_range[3] lists the micro shell ranges of this step; each
            # fills aoshs[2] consecutive rows of iobuf.
            for imic, aoshs in enumerate(sh_range[3]):
                log.debug1(' fill iobuf micro [%d/%d], AO [%d:%d], len(aobuf) = %d',
                           imic+1, nmic, *aoshs)
                buf = _ao2mo.r_e1(intor, moij, ijshape, aoshs,
                                  mol._atm, mol._bas, mol._env,
                                  tao, aosym, comp, ao2mopt)
                iobuf[:, p0:p0+aoshs[2]] = buf
                p0 += aoshs[2]
            ti2 = log.timer('gen AO/transform MO [%d/%d]'%(istep+1,nstep),
                            *ti0)

            e2buflen = guess_e2bufsize(ioblk_size, nij_pair, buflen)[0]
            for icomp in range(comp):
                dset = fswap.create_dataset('%d/%d'%(icomp,istep),
                                            (nij_pair, iobuf.shape[1]), 'c16',
                                            chunks=None)
                # Write the transposed block in row slabs of e2buflen.
                for col0, col1 in prange(0, nij_pair, e2buflen):
                    dset[col0:col1] = lib.transpose(iobuf[icomp, :, col0:col1])
            ti0 = log.timer('transposing to disk', *ti2)
    return swapfile
def _contract_vvvv_t2(mycc, vvvv, t2T, task_locs, out=None, verbose=None):
    '''Ht2 = numpy.einsum('ijcd,acbd->ijab', t2, vvvv)
    where vvvv has to be real and has the 4-fold permutation symmetry

    Only the AO-direct path (``vvvv is None``) is implemented.  The AO
    integrals are generated in shell blocks and contracted with the blocks
    of ``t2T`` yielded by ``_rotate_tensor_block``.

    Args:
        mycc : object exposing ``mol`` and ``max_memory``; it is also passed
            to ``logger.new_logger``.
        vvvv : None or integral object
            if vvvv is None, contract t2 to AO-integrals using AO-direct
            algorithm
        t2T : double-precision array, or a callable returning one.  Its first
            two dimensions are (nvira, nvirb); the rest are flattened.
        task_locs : shell boundaries of each task; entries ``rank`` and
            ``rank + 1`` delimit the AO shells owned by this process.
        out : optional buffer used as storage for the result.
        verbose : forwarded to ``logger.new_logger``.

    Returns:
        Ht2, an array of shape (nvira, nvirb, nocc2).

    Raises:
        NotImplementedError : if ``vvvv`` is not None.
    '''
    # time.clock() no longer exists on Python >= 3.8; use the logger's clock
    # helpers, which match what log.timer_debug1() measures against.
    time0 = logger.process_clock(), logger.perf_counter()
    mol = mycc.mol
    log = logger.new_logger(mycc, verbose)

    if callable(t2T):
        t2T = t2T()
    assert (t2T.dtype == numpy.double)
    nvira, nvirb = t2T.shape[:2]
    nvir2 = nvira * nvirb
    t2T = t2T.reshape(nvira, nvirb, -1)
    nocc2 = t2T.shape[2]
    Ht2 = numpy.ndarray(t2T.shape, dtype=t2T.dtype, buffer=out)
    Ht2[:] = 0

    _dgemm = lib.numpy_helper._dgemm

    def contract_blk_(Ht2, t2T, eri, i0, i1, j0, j1):
        ic = i1 - i0
        jc = j1 - j0
        #:Ht2[j0:j1] += numpy.einsum('efx,efab->abx', t2T[i0:i1], eri)
        # The last three arguments are offsets into eri, t2T and Ht2.
        _dgemm('T', 'N', jc * nvirb, nocc2, ic * nvirb,
               eri.reshape(ic * nvirb, jc * nvirb),
               t2T.reshape(-1, nocc2),
               Ht2.reshape(nvir2, nocc2),
               1, 1, 0, i0 * nvirb * nocc2, j0 * nvirb * nocc2)

    max_memory = max(MEMORYMIN, mycc.max_memory - lib.current_memory()[0])
    if vvvv is None:  # AO-direct CCSD
        ao_loc = mol.ao_loc_nr()
        intor = mol._add_suffix('int2e')
        ao2mopt = _ao2mo.AO2MOpt(mol, intor, 'CVHFnr_schwarz_cond',
                                 'CVHFsetnr_direct_scf')
        blksize = max(BLKMIN,
                      numpy.sqrt(max_memory * .9e6 / 8 / nvirb**2 / 2))
        fint = gto.moleintor.getints4c
        fload = ccsd._ccsd.libcc.CCload_eri

        ntasks = mpi.pool.size
        task_sh_locs = task_locs
        # Partition the shells of every task into blocks of about blksize AOs.
        sh_ranges_tasks = []
        for task in range(ntasks):
            sh0 = task_sh_locs[task]
            sh1 = task_sh_locs[task + 1]
            sh_ranges = ao2mo.outcore.balance_partition(
                ao_loc, blksize, sh0, sh1)
            sh_ranges_tasks.append(sh_ranges)

        # Buffers are sized for the largest block produced on any task.
        blksize = max(
            max(x[2] for x in sh_ranges) if sh_ranges else 0
            for sh_ranges in sh_ranges_tasks)
        eribuf = numpy.empty((blksize, blksize, nvirb, nvirb))
        loadbuf = numpy.empty((blksize, blksize, nvirb, nvirb))

        ao_sh_ranges = sh_ranges_tasks[rank]
        ao_sh0 = task_sh_locs[rank]
        ao_sh1 = task_sh_locs[rank + 1]
        ao_offset = ao_loc[ao_sh0]
        assert (nvira == ao_loc[ao_sh1] - ao_loc[ao_sh0])

        for task_id, t2T in _rotate_tensor_block(t2T):
            sh_ranges = sh_ranges_tasks[task_id]
            sh0 = task_sh_locs[task_id]
            cur_offset = ao_loc[sh0]
            for ish0, ish1, ni in sh_ranges:
                for jsh0, jsh1, nj in ao_sh_ranges:
                    eri = fint(intor, mol._atm, mol._bas, mol._env,
                               shls_slice=(ish0, ish1, jsh0, jsh1),
                               aosym='s2kl', ao_loc=ao_loc,
                               cintopt=ao2mopt._cintopt, out=eribuf)
                    # Block offsets relative to the segment of the task that
                    # produced t2T (i) and to this process's own segment (j).
                    i0, i1 = (ao_loc[ish0] - cur_offset,
                              ao_loc[ish1] - cur_offset)
                    j0, j1 = (ao_loc[jsh0] - ao_offset,
                              ao_loc[jsh1] - ao_offset)
                    tmp = numpy.ndarray((i1 - i0, nvirb, j1 - j0, nvirb),
                                        buffer=loadbuf)
                    fload(tmp.ctypes.data_as(ctypes.c_void_p),
                          eri.ctypes.data_as(ctypes.c_void_p),
                          (ctypes.c_int * 4)(i0, i1, j0, j1),
                          ctypes.c_int(nvirb))
                    contract_blk_(Ht2, t2T, tmp, i0, i1, j0, j1)
                    time0 = log.timer_debug1(
                        'AO-vvvv [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                        *time0)
    else:
        raise NotImplementedError
    return Ht2
def add_wvvVV_(self, t1, t2, eris, t2new_tril, with_ovvv=True):
    '''Add the vvvv contraction of tau to ``t2new_tril`` in place.

    tau is built as t2 + t1 (x) t1 for occupied pairs j <= i (packed lower
    triangle).  When ``self.direct`` is true the contraction is performed
    with AO integrals generated on the fly; otherwise ``eris.vvvv`` is read
    row block by row block.

    Args:
        t1 : array of shape (nocc, nvir).
        t2 : array indexed as t2[i, :i+1], giving (nvir, nvir) blocks.
        eris : integral container; only ``eris.vvvv`` is read here, and only
            on the non-direct path.
        t2new_tril : array updated in place, addressed as
            (nocc*(nocc+1)//2, nvir, nvir).
        with_ovvv : on the AO-direct path, also subtract the two
            t1-contracted terms described in the comments below.

    Returns:
        ``t2new_tril``.
    '''
    # time.clock() no longer exists on Python >= 3.8; use the logger's clock
    # helpers, which match what logger.timer_debug1() measures against.
    time0 = logger.process_clock(), logger.perf_counter()
    nocc, nvir = t1.shape

    #: tau = t2 + numpy.einsum('ia,jb->ijab', t1, t1)
    #: t2new += numpy.einsum('ijcd,acdb->ijab', tau, vvvv)
    def contract_rec_(t2new_tril, tau, eri, i0, i1, j0, j1):
        # Rectangular (off-diagonal) AO block: apply it and its transpose.
        nao = tau.shape[-1]
        ic = i1 - i0
        jc = j1 - j0
        #: t2tril[:,j0:j1] += numpy.einsum('xcd,cdab->xab', tau[:,i0:i1], eri)
        _dgemm('N', 'N', nocc*(nocc+1)//2, jc*nao, ic*nao,
               tau.reshape(-1, nao*nao), eri.reshape(-1, jc*nao),
               t2new_tril.reshape(-1, nao*nao),
               1, 1, i0*nao, 0, j0*nao)

        #: t2tril[:,i0:i1] += numpy.einsum('xcd,abcd->xab', tau[:,j0:j1], eri)
        _dgemm('N', 'T', nocc*(nocc+1)//2, ic*nao, jc*nao,
               tau.reshape(-1, nao*nao), eri.reshape(-1, jc*nao),
               t2new_tril.reshape(-1, nao*nao),
               1, 1, j0*nao, 0, i0*nao)

    def contract_tril_(t2new_tril, tau, eri, a0, a):
        # Triangular (diagonal) block: row a against rows a0..a.
        nvir = tau.shape[-1]
        #: t2new[i,:i+1, a] += numpy.einsum('xcd,cdb->xb', tau[:,a0:a+1], eri)
        _dgemm('N', 'N', nocc*(nocc+1)//2, nvir, (a+1-a0)*nvir,
               tau.reshape(-1, nvir*nvir), eri.reshape(-1, nvir),
               t2new_tril.reshape(-1, nvir*nvir),
               1, 1, a0*nvir, 0, a*nvir)

        #: t2new[i,:i+1,a0:a] += numpy.einsum('xd,abd->xab', tau[:,a], eri[:a])
        if a > a0:
            _dgemm('N', 'T', nocc*(nocc+1)//2, (a-a0)*nvir, nvir,
                   tau.reshape(-1, nvir*nvir), eri.reshape(-1, nvir),
                   t2new_tril.reshape(-1, nvir*nvir),
                   1, 1, a*nvir, 0, a0*nvir)

    if self.direct:  # AO-direct CCSD
        mol = self.mol
        mo = _mo_without_core(self, self.mo_coeff)
        nao, nmo = mo.shape
        aos = numpy.asarray(mo[:, nocc:].T, order='F')
        nocc2 = nocc*(nocc+1)//2
        outbuf = numpy.empty((nocc2, nao, nao))
        # tau is first assembled inside outbuf's memory to save an allocation.
        tau = numpy.ndarray((nocc2, nvir, nvir), buffer=outbuf)
        p0 = 0
        for i in range(nocc):
            tau[p0:p0+i+1] = numpy.einsum('a,jb->jab', t1[i], t1[:i+1])
            tau[p0:p0+i+1] += t2[i, :i+1]
            p0 += i + 1
        # Back-transform the virtual indices of tau to the AO basis.
        tau = _ao2mo.nr_e2(tau.reshape(nocc2, nvir**2), aos,
                           (0, nao, 0, nao), 's1', 's1')
        tau = tau.reshape(nocc2, nao, nao)
        time0 = logger.timer_debug1(self, 'vvvv-tau', *time0)

        ao2mopt = _ao2mo.AO2MOpt(mol, 'cint2e_sph', 'CVHFnr_schwarz_cond',
                                 'CVHFsetnr_direct_scf')
        outbuf[:] = 0
        ao_loc = mol.ao_loc_nr()
        max_memory = max(0, self.max_memory - lib.current_memory()[0])
        dmax = max(4, int(numpy.sqrt(max_memory*.95e6/8/nao**2/2)))
        sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
        dmax = max(x[2] for x in sh_ranges)
        eribuf = numpy.empty((dmax, dmax, nao, nao))
        loadbuf = numpy.empty((dmax, dmax, nao, nao))
        fint = gto.moleintor.getints2e

        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            # Strictly lower-triangular shell-block pairs.
            for jsh0, jsh1, nj in sh_ranges[:ip]:
                eri = fint('cint2e_sph', mol._atm, mol._bas, mol._env,
                           shls_slice=(ish0, ish1, jsh0, jsh1), aosym='s2kl',
                           ao_loc=ao_loc, cintopt=ao2mopt._cintopt,
                           out=eribuf)
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                tmp = numpy.ndarray((i1-i0, nao, j1-j0, nao), buffer=loadbuf)
                _ccsd.libcc.CCload_eri(tmp.ctypes.data_as(ctypes.c_void_p),
                                       eri.ctypes.data_as(ctypes.c_void_p),
                                       (ctypes.c_int*4)(i0, i1, j0, j1),
                                       ctypes.c_int(nao))
                contract_rec_(outbuf, tau, tmp, i0, i1, j0, j1)
                time0 = logger.timer_debug1(
                    self, 'AO-vvvv [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time0)
            # Diagonal shell block, generated with 4-fold packing.
            eri = fint('cint2e_sph', mol._atm, mol._bas, mol._env,
                       shls_slice=(ish0, ish1, ish0, ish1), aosym='s4',
                       ao_loc=ao_loc, cintopt=ao2mopt._cintopt, out=eribuf)
            i0, i1 = ao_loc[ish0], ao_loc[ish1]
            for i in range(i1-i0):
                p0, p1 = i*(i+1)//2, (i+1)*(i+2)//2
                tmp = lib.unpack_tril(eri[p0:p1], out=loadbuf)
                contract_tril_(outbuf, tau, tmp, i0, i0+i)
            time0 = logger.timer_debug1(
                self, 'AO-vvvv [%d:%d,%d:%d]' % (ish0, ish1, ish0, ish1),
                *time0)
        eribuf = loadbuf = eri = tmp = None

        # Transform the accumulated AO-basis result to the virtual space;
        # tau's memory is reused as the output buffer.
        tmp = _ao2mo.nr_e2(outbuf, mo, (nocc, nmo, nocc, nmo), 's1', 's1',
                           out=tau)
        t2new_tril += tmp.reshape(nocc2, nvir, nvir)

        if with_ovvv:
            #: tmp = numpy.einsum('ijcd,ka,kdcb->ijba', tau, t1, eris.ovvv)
            #: t2new -= tmp + tmp.transpose(1,0,3,2)
            tmp = _ao2mo.nr_e2(outbuf, mo, (nocc, nmo, 0, nocc), 's1', 's1',
                               out=tau)
            t2new_tril -= lib.ddot(tmp.reshape(nocc2*nvir, nocc),
                                   t1).reshape(nocc2, nvir, nvir)
            tmp = _ao2mo.nr_e2(outbuf, mo, (0, nocc, nocc, nmo), 's1', 's1',
                               out=tau)
            #: t2new_tril -= numpy.einsum('xkb,ka->xab', tmp.reshape(-1,nocc,nvir), t1)
            tmp = lib.transpose(tmp.reshape(nocc2, nocc, nvir),
                                axes=(0, 2, 1), out=outbuf)
            tmp = lib.ddot(tmp.reshape(nocc2*nvir, nocc), t1, 1,
                           numpy.ndarray((nocc2*nvir, nvir), buffer=tau), 0)
            tmp = lib.transpose(tmp.reshape(nocc2, nvir, nvir),
                                axes=(0, 2, 1), out=outbuf)
            t2new_tril -= tmp.reshape(nocc2, nvir, nvir)

    else:
        #: tau = t2 + numpy.einsum('ia,jb->ijab', t1, t1)
        #: t2new += numpy.einsum('ijcd,acdb->ijab', tau, vvvv)
        tau = numpy.empty((nocc*(nocc+1)//2, nvir, nvir))
        p0 = 0
        for i in range(nocc):
            tau[p0:p0+i+1] = numpy.einsum('a,jb->jab', t1[i], t1[:i+1])
            tau[p0:p0+i+1] += t2[i, :i+1]
            p0 += i + 1
        time0 = logger.timer_debug1(self, 'vvvv-tau', *time0)

        p0 = 0
        # Two unpack buffers are alternated so the contraction started by
        # async_do can still read one while the next rows are unpacked.
        outbuf = numpy.empty((nvir, nvir, nvir))
        outbuf1 = numpy.empty((nvir, nvir, nvir))
        handler = None
        for a in range(nvir):
            buf = lib.unpack_tril(eris.vvvv[p0:p0+a+1], out=outbuf)
            outbuf, outbuf1 = outbuf1, outbuf
            handler = async_do(handler, contract_tril_, t2new_tril, tau, buf,
                               0, a)
            p0 += a+1
            time0 = logger.timer_debug1(self, 'vvvv %d'%a, *time0)
        # With nvir == 0 the loop never runs and there is nothing to wait for.
        if handler is not None:
            handler.join()
    return t2new_tril
def _make_eris(mp, mo_coeff=None, verbose=None):
    """Build the (ov|ov) MO integrals with a two-pass, MPI-distributed
    out-of-core AO->MO transformation.

    Pass 1 evaluates AO integral blocks restricted (in the last index) to
    the AO shells assigned to this rank, contracts two AO indices with the
    occupied orbitals, exchanges the partial results between ranks with
    ``mpi.alltoall`` and stores them in a temporary HDF5 file.
    Pass 2 reads the half-transformed blocks back, transforms the two
    remaining AO indices with the virtual orbitals and writes the result
    into the ``'ovov'`` dataset of ``eris.feri``.

    Relies on the module-level names ``rank``, ``comm``, ``BLKMIN`` and
    ``_task_location``.

    Args:
        mp: MP2 object; ``mol``, ``nocc``, ``nmo`` and ``max_memory`` are read.
        mo_coeff: forwarded to ``mp2._ChemistsERIs``.
        verbose: forwarded to ``logger.new_logger``.

    Returns:
        The ``eris`` object. ``eris.ovov`` is an HDF5 dataset of shape
        ``(nocc, nvir, nocc_seg, nvir)``, where ``nocc_seg`` is the number
        of occupied orbitals assigned to this rank. The object is also
        stored on ``mp._eris``.
    """
    log = logger.new_logger(mp, verbose)
    # time.clock() was removed in Python 3.8; use process_time when the
    # interpreter provides it and only fall back to clock on old versions.
    cpu_clock = getattr(time, 'process_time', None) or time.clock
    time0 = (cpu_clock(), time.time())

    log.debug('transform (ia|jb) outcore')
    mol = mp.mol
    nocc = mp.nocc
    nmo = mp.nmo
    nvir = nmo - nocc

    eris = mp2._ChemistsERIs(mp, mo_coeff)
    nao = eris.mo_coeff.shape[0]
    assert nvir <= nao
    orbo = eris.mo_coeff[:, :nocc]
    orbv = numpy.asarray(eris.mo_coeff[:, nocc:], order='F')
    eris.feri = lib.H5TmpFile()

    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    fint = gto.moleintor.getints4c

    # Range of occupied orbitals handled by each task.
    ntasks = mpi.pool.size
    olocs = [_task_location(nocc, task_id) for task_id in range(ntasks)]
    oloc0, oloc1 = olocs[rank]
    nocc_seg = oloc1 - oloc0
    log.debug2('olocs %s', olocs)

    # Range of AO shells (and the matching AO functions) handled by this rank.
    ao_loc = mol.ao_loc_nr()
    task_sh_locs = lib.misc._balanced_partition(ao_loc, ntasks)
    log.debug2('task_sh_locs %s', task_sh_locs)
    ao_sh0 = task_sh_locs[rank]
    ao_sh1 = task_sh_locs[rank + 1]
    ao_loc0 = ao_loc[ao_sh0]
    ao_loc1 = ao_loc[ao_sh1]
    nao_seg = ao_loc1 - ao_loc0
    orbo_seg = orbo[ao_loc0:ao_loc1]

    mem_now = lib.current_memory()[0]
    max_memory = max(0, mp.max_memory - mem_now)
    dmax = numpy.sqrt(max_memory * .9e6 / 8 /
                      ((nao + nocc) * (nao_seg + nocc)))
    # Every rank must use the same block size: take the smallest estimate.
    dmax = min(nao // 4 + 2, max(BLKMIN, min(comm.allgather(dmax))))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    sh_ranges = comm.bcast(sh_ranges)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao_seg))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax,
              nocc * nocc_seg * (nao * (nao + dmax) / 2 + nvir**2) * 8 / 1e6)

    def exchange_and_store(count, tmp_xo):
        # Sum the per-rank partial contractions and keep only this rank's
        # occupied segment; stored under '<count>b'.
        di, dj = tmp_xo.shape[2:4]
        tmp_xo = [tmp_xo[p0:p1] for p0, p1 in olocs]
        tmp_xo = mpi.alltoall(tmp_xo, split_recvbuf=True)
        tmp_xo = sum(tmp_xo).reshape(nocc_seg, nocc, di, dj)
        ftmp[str(count) + 'b'] = tmp_xo

        # Second exchange: gather the blocks with the segmented occupied
        # index in the second position; stored under '<count>a'.
        tmp_ox = mpi.alltoall([tmp_xo[:, p0:p1] for p0, p1 in olocs],
                              split_recvbuf=True)
        tmp_ox = [tmp_ox[i].reshape(p1 - p0, nocc_seg, di, dj)
                  for i, (p0, p1) in enumerate(olocs)]
        ftmp[str(count) + 'a'] = numpy.vstack(tmp_ox)

    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(exchange_and_store) as bg_save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            # Only the lower triangle of shell-block pairs is computed.
            for jsh0, jsh1, nj in sh_ranges[:ip + 1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0, i1, j0, j1))

                shls_slice = (0, mol.nbas, ish0, ish1,
                              jsh0, jsh1, ao_sh0, ao_sh1)
                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=shls_slice, aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                tmp_xo = lib.einsum('pi,pqrs->iqrs', orbo, eri)
                tmp_xo = lib.einsum('iqrs,sl->ilqr', tmp_xo, orbo_seg)
                bg_save(count, tmp_xo)
                tmp_xo = None
                count += 1
                time1 = log.timer_debug1(
                    'partial ao2mo [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time1)
    eri = eribuf = None
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)

    eris.ovov = eris.feri.create_dataset(
        'ovov', (nocc, nvir, nocc_seg, nvir), 'f8')
    occblk = int(min(nocc, max(BLKMIN,
                               max_memory * .9e6 / 8 /
                               (nao**2 * nocc_seg + 1) / 5)))

    def load(i0, eri):
        # Fill `eri` with the half-transformed integrals for occupied
        # orbitals [i0, i0+occblk); a no-op once i0 runs past nocc.
        if i0 < nocc:
            i1 = min(i0 + occblk, nocc)
            for k, (p0, p1, q0, q1) in enumerate(jk_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = ftmp[str(k) + 'a'][i0:i1]
                if p0 != q0:
                    # Off-diagonal block: fill the transposed AO block from
                    # the 'b' copy.
                    dat = numpy.asarray(ftmp[str(k) + 'b'][:, i0:i1])
                    eri[:i1 - i0, :, q0:q1, p0:p1] = dat.transpose(1, 0, 3, 2)

    def write_ovov(i0, i1, dat):
        eris.ovov[i0:i1] = dat

    buf_prefetch = numpy.empty((occblk, nocc_seg, nao, nao))
    buf = numpy.empty_like(buf_prefetch)
    bufw = numpy.empty((occblk * nocc_seg, nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(write_ovov) as bsave:
            load(0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocc, occblk):
                # Swap buffers: work on the block just loaded while the next
                # one is being read in the background.
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(i1, buf_prefetch)
                eri = buf[:i1 - i0].reshape((i1 - i0) * nocc_seg, nao, nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0, nvir, 0, nvir),
                                   's1', 's1', out=bufw)
                bsave(i0, i1,
                      dat.reshape(i1 - i0, nocc_seg, nvir, nvir)
                      .transpose(0, 2, 1, 3))
                # Alternate output buffers so the next transform does not
                # overwrite data the background writer may still be reading.
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0, i1),
                                         *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    mp._eris = eris
    return eris
def half_e1(mol, mo_coeffs, swapfile,
            intor='cint2e_sph', aosym='s4', comp=1,
            max_memory=2000, ioblk_size=IOBLK_SIZE, verbose=logger.WARN,
            compact=True, ao2mopt=None):
    r'''Half transform arbitrary spherical AO integrals to MO integrals
    for the given two sets of orbitals

    Args:
        mol : :class:`Mole` object
            AO integrals will be generated in terms of mol._atm, mol._bas, mol._env
        mo_coeffs : sequence of two ndarrays
            The two sets of orbitals (mo_coeffs[0], mo_coeffs[1]) used to
            transform the first electron (ij| of (ij|kl).
        swapfile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.  The
            transformed integrals are saved in blocks, one group per
            integral component and one entry per step.  When a str is
            given, the file is created (mode 'w') and closed before
            returning.

    Kwargs
        intor : str
            Name of the 2-electron integral.  Ref to
            :func:`getints_by_shell`
            for the complete list of available 2-electron integral names
        aosym : int or str
            Permutation symmetry for the AO integrals

            | 4 or '4' or 's4': 4-fold symmetry (default)
            | '2ij' or 's2ij' : symmetry between i, j in (ij|kl)
            | '2kl' or 's2kl' : symmetry between k, l in (ij|kl)
            | 1 or '1' or 's1': no symmetry
            | 'a4ij' : 4-fold symmetry with anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a4kl' : 4-fold symmetry with anti-symmetry between k, l in (ij|kl) (TODO)
            | 'a2ij' : anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a2kl' : anti-symmetry between k, l in (ij|kl) (TODO)

        comp : int
            Components of the integrals, e.g. cint2e_ip_sph has 3 components.
        max_memory : float or int
            The maximum size of cache to use (in MB), large cache may **not**
            improve performance.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        verbose : int or :class:`Logger` object
            Print level, or an existing logger to write to
        compact : bool
            When compact is True, depending on the four orbital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals
        ao2mopt : :class:`AO2MOpt` object
            Precomputed data to improve performance

    Returns:
        The ``swapfile`` argument, unchanged.
    '''
    # time.clock() was removed in Python 3.8; use process_time when the
    # interpreter provides it and only fall back to clock on old versions.
    cpu_clock = getattr(time, 'process_time', None) or time.clock
    time0 = (cpu_clock(), time.time())
    if isinstance(verbose, logger.Logger):
        log = verbose
    else:
        log = logger.Logger(mol.stdout, verbose)

    nao = mo_coeffs[0].shape[0]
    aosym = _stand_sym_code(aosym)
    if aosym in ('s4', 's2ij'):
        nao_pair = nao * (nao+1) // 2
    else:
        nao_pair = nao * nao

    ijmosym, nij_pair, moij, ijshape = \
        incore._conc_mos(mo_coeffs[0], mo_coeffs[1],
                         compact and aosym in ('s4', 's2ij'))

    e1buflen, mem_words, iobuf_words, ioblk_words = \
        guess_e1bufsize(max_memory, ioblk_size, nij_pair, nao_pair, comp)
    # The buffer to hold AO integrals in C code, see line (@)
    aobuflen = int((mem_words - iobuf_words) // (nao_pair*comp))
    shranges = guess_shell_ranges(mol, (aosym in ('s4', 's2kl')),
                                  e1buflen, aobuflen)
    if ao2mopt is None:
        if intor == 'cint2e_sph':
            ao2mopt = _ao2mo.AO2MOpt(mol, intor, 'CVHFnr_schwarz_cond',
                                     'CVHFsetnr_direct_scf')
        else:
            ao2mopt = _ao2mo.AO2MOpt(mol, intor)

    # Only a file opened here is closed here; a caller-supplied File/Group
    # is left open for the caller.
    opened_here = isinstance(swapfile, str)
    if opened_here:
        fswap = h5py.File(swapfile, 'w')
    else:
        fswap = swapfile
    try:
        for icomp in range(comp):
            fswap.create_group(str(icomp))  # for h5py old version

        log.debug('step1: tmpfile %s %.8g MB', fswap.filename,
                  nij_pair*nao_pair*8/1e6)
        log.debug('step1: (ij,kl) = (%d,%d), mem cache %.8g MB, iobuf %.8g MB',
                  nij_pair, nao_pair, mem_words*8/1e6, iobuf_words*8/1e6)

        # transform e1
        ti0 = log.timer('Initializing ao2mo.outcore.half_e1', *time0)
        nstep = len(shranges)
        maxbuflen = max(x[2] for x in shranges)
        bufs1 = numpy.empty((comp*maxbuflen, nao_pair))
        bufs2 = numpy.empty((comp*maxbuflen, nij_pair))
        for istep, sh_range in enumerate(shranges):
            log.debug1('step 1 [%d/%d], AO [%d:%d], len(buf) = %d',
                       istep+1, nstep, *(sh_range[:3]))
            buflen = sh_range[2]
            iobuf = bufs2[:comp*buflen].reshape(comp, buflen, nij_pair)
            nmic = len(sh_range[3])
            p0 = 0
            # Each macro step is filled by several micro batches so that the
            # AO buffer stays within aobuflen.
            for imic, aoshs in enumerate(sh_range[3]):
                log.debug2(' fill iobuf micro [%d/%d], AO [%d:%d], len(aobuf) = %d',
                           imic+1, nmic, *aoshs)
                buf = bufs1[:comp*aoshs[2]]  # (@)
                _ao2mo.nr_e1fill(intor, aoshs, mol._atm, mol._bas, mol._env,
                                 aosym, comp, ao2mopt, out=buf)
                buf = _ao2mo.nr_e1(buf, moij, ijshape, aosym, ijmosym)
                iobuf[:, p0:p0+aoshs[2]] = buf.reshape(comp, aoshs[2], -1)
                p0 += aoshs[2]
            ti2 = log.timer_debug1('gen AO/transform MO [%d/%d]'
                                   % (istep+1, nstep), *ti0)

            e2buflen, _ = guess_e2bufsize(ioblk_size, nij_pair, buflen)
            for icomp in range(comp):
                _transpose_to_h5g(fswap, '%d/%d' % (icomp, istep),
                                  iobuf[icomp], e2buflen, None)
            ti0 = log.timer_debug1('transposing to disk', *ti2)
        bufs1 = bufs2 = None
    finally:
        # Close the file even when a step above raises, so the handle
        # is not leaked.
        if opened_here:
            fswap.close()
    return swapfile
def _ao2mo_ovov(mp, orbo, orbv, feri, max_memory=2000, verbose=None):
    """Two-pass out-of-core transformation of AO integrals to (ov|ov).

    Pass 1 evaluates AO integral blocks for the lower triangle of
    shell-range pairs, contracts the two outer AO indices with ``orbo``
    and stores the results in a temporary HDF5 file.  Pass 2 reloads them
    in batches of occupied orbitals, restores the full AO-pair ordering
    and transforms the remaining AO pair with ``orbv``.

    Args:
        mp: object providing ``mol`` (also passed to ``logger.new_logger``).
        orbo: array of shape (nao, nocc), used as the occupied orbitals.
        orbv: array whose second dimension is nvir, used as the virtual
            orbitals.
        feri: HDF5 file/group in which the ``'ovov'`` dataset is created.
        max_memory: memory budget in MB used to size the work buffers.
        verbose: forwarded to ``logger.new_logger``.

    Returns:
        The ``'ovov'`` dataset of shape (nocc*nvir, nocc*nvir).
    """
    # time.clock() was removed in Python 3.8; use process_time when the
    # interpreter provides it and only fall back to clock on old versions.
    cpu_clock = getattr(time, 'process_time', None) or time.clock
    time0 = (cpu_clock(), time.time())
    log = logger.new_logger(mp, verbose)

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nao, nocc = orbo.shape
    nvir = orbv.shape[1]
    nbas = mol.nbas
    assert nvir <= nao

    ao_loc = mol.ao_loc_nr()
    dmax = max(4, min(nao / 3,
                      numpy.sqrt(max_memory * .95e6 / 8 / (nao + nocc)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax,
              nocc**2 * (nao * (nao + dmax) / 2 + nvir**2) * 8 / 1e6)

    buf_i = numpy.empty((nocc * dmax**2 * nao))
    buf_li = numpy.empty((nocc**2 * dmax**2))
    buf1 = numpy.empty_like(buf_li)

    fint = gto.moleintor.getints4c
    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            for jsh0, jsh1, nj in sh_ranges[:ip + 1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0, i1, j0, j1))

                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=(0, nbas, ish0, ish1,
                                       jsh0, jsh1, 0, nbas),
                           aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                # Views on the preallocated work buffers for this block.
                tmp_i = numpy.ndarray((nocc, (i1 - i0) * (j1 - j0) * nao),
                                      buffer=buf_i)
                tmp_li = numpy.ndarray((nocc, nocc * (i1 - i0) * (j1 - j0)),
                                       buffer=buf_li)
                lib.ddot(orbo.T,
                         eri.reshape(nao, (i1 - i0) * (j1 - j0) * nao),
                         c=tmp_i)
                lib.ddot(orbo.T,
                         tmp_i.reshape(nocc * (i1 - i0) * (j1 - j0), nao).T,
                         c=tmp_li)
                tmp_li = tmp_li.reshape(nocc, nocc, (i1 - i0), (j1 - j0))
                save(str(count), tmp_li.transpose(1, 0, 2, 3))
                # Alternate output buffers so the next block does not
                # overwrite data the background writer may still be reading.
                buf_li, buf1 = buf1, buf_li
                count += 1
                time1 = log.timer_debug1(
                    'partial ao2mo [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    # Drop references to the pass-1 buffers before allocating pass-2 ones.
    eri = eribuf = tmp_i = tmp_li = buf_i = buf_li = buf1 = None

    chunks = (nvir, nvir)
    h5dat = feri.create_dataset('ovov', (nocc * nvir, nocc * nvir), 'f8',
                                chunks=chunks)
    # jk_where is the sorting indices for the stacked (oO|pP) integrals in pass 2
    jk_where = []
    aoao_idx = numpy.arange(nao * nao).reshape(nao, nao)
    for i0, i1, j0, j1 in jk_blk_slices:
        # idx of pP in <oO|pP>
        jk_where.append(aoao_idx[i0:i1, j0:j1].ravel())
        if i0 != j0:
            # idx of pP in (<oO|pP>).transpose(1,0,3,2)
            jk_where.append(aoao_idx[j0:j1, i0:i1].ravel())
    jk_where = numpy.argsort(numpy.hstack(jk_where)).astype(numpy.int32)
    orbv = numpy.asarray(orbv, order='F')

    occblk = int(min(nocc, max(4, 250 / nocc,
                               max_memory * .9e6 / 8 / (nao**2 * nocc) / 3)))

    def load(i0, eri):
        # Stack the stored blocks for occupied orbitals [i0, i0+occblk)
        # column-wise into `eri`, in the same order used to build jk_where.
        if i0 >= nocc:
            return
        i1 = min(i0 + occblk, nocc)
        eri = eri[:(i1 - i0) * nocc]
        p1 = 0
        for k, jk_slice in enumerate(jk_blk_slices):
            dat = numpy.asarray(ftmp[str(k)][i0:i1]).reshape(
                (i1 - i0) * nocc, -1)
            p0, p1 = p1, p1 + dat.shape[1]
            eri[:, p0:p1] = dat
            if jk_slice[0] != jk_slice[2]:
                # Off-diagonal block: also append the transposed counterpart.
                dat = numpy.asarray(ftmp[str(k)][:, i0:i1])
                dat = dat.transpose(1, 0, 3, 2).reshape((i1 - i0) * nocc, -1)
                p0, p1 = p1, p1 + dat.shape[1]
                eri[:, p0:p1] = dat

    def save(i0, i1, dat):
        for i in range(i0, i1):
            h5dat[i * nvir:(i + 1) * nvir] = dat[i - i0].reshape(
                nvir, nocc * nvir)

    buf_prefetch = numpy.empty((occblk * nocc, nao**2))
    buf = numpy.empty_like(buf_prefetch)
    buf1 = numpy.empty_like(buf_prefetch)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            load(0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocc, occblk):
                # Swap buffers: work on the batch just loaded while the next
                # one is being read in the background.
                buf, buf_prefetch = buf_prefetch, buf
                eri = buf[:(i1 - i0) * nocc]
                prefetch(i1, buf_prefetch)

                # Reorder the stacked columns into plain (nao, nao) order.
                idx = numpy.arange(eri.shape[0], dtype=numpy.int32)
                dat = lib.take_2d(eri, idx, jk_where, out=buf1)
                dat = _ao2mo.nr_e2(dat, orbv, (0, nvir, 0, nvir),
                                   's1', 's1', out=eri)
                bsave(i0, i1,
                      dat.reshape(i1 - i0, nocc, nvir, nvir)
                      .transpose(0, 2, 1, 3))
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0, i1),
                                         *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    return h5dat