Exemplo n.º 1
0
    def test_unpack_row(self):
        row = numpy.arange(28.)
        ref = lib.unpack_tril(row)[4]
        self.assertTrue(numpy.array_equal(ref, lib.unpack_row(row, 4)))

        row = numpy.arange(28, dtype=numpy.int32)
        ref = lib.unpack_tril(row)[4]
        a = lib.unpack_row(row, 4)
        self.assertTrue(numpy.array_equal(ref, a))
        self.assertTrue(a.dtype == numpy.int32)
Exemplo n.º 2
0
    def test_unpack_row(self):
        row = numpy.arange(28.)
        ref = lib.unpack_tril(row)[4]
        self.assertTrue(numpy.array_equal(ref, lib.unpack_row(row, 4)))

        row = numpy.arange(28, dtype=numpy.int32)
        ref = lib.unpack_tril(row)[4]
        a = lib.unpack_row(row, 4)
        self.assertTrue(numpy.array_equal(ref, a))
        self.assertTrue(a.dtype == numpy.int32)
Exemplo n.º 3
0
def general(eri, mo_coeffs, erifile, dataname='eri_mo',
            ioblk_size=IOBLK_SIZE, compact=True, verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.
    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.
    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By assigning
            different dataname, the existed integral file can be reused.  If
            the erifile contains the dataname, the new integrals data will
            overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        compact : bool
            When compact is True, depending on the four oribital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals


    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
                For lo = 1 -> npair
                    Unpack row lo
                    Unpack row lo to matrix E_{uv}^{lo}
                    Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                    Ravel or pack E_{ij}^{lo}
                    Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
                For ij = 1 -> nij_pair
                    Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                    Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                    Repack E_{kl}^{ij}
                    Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being transformed.
        Since entire rows or columns need to be read in, the arrays are chunked
        such that IOBLK_SIZE = row/col x chunking col/row. For example, for the
        first half transform, we would save in nij_pair x IOBLK_SIZE/nij_pair,
        then load in IOBLK_SIZE/nkl_pair x npair for the second half transform.

        ------ kl ----->
        |jxl
        |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl. If the super-rows/cols are
        larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly. The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.

    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nmok = mo_coeffs[2].shape[1]
    nmol = mo_coeffs[3].shape[1]
    nao = mo_coeffs[0].shape[0]

    nao_pair = nao*(nao+1) // 2
    if compact and iden_coeffs(mo_coeffs[0], mo_coeffs[1]):
        ij_red = False
        nij_pair = nmoi*(nmoi+1) // 2
    else:
        ij_red = True
        nij_pair = nmoi*nmoj
    if compact and iden_coeffs(mo_coeffs[2], mo_coeffs[3]):
        kl_red = False
        nkl_pair = nmok*(nmok+1) // 2
    else:
        kl_red = True
        nkl_pair = nmok*nmol

    chunks_half = (max(1, numpy.minimum(int(ioblk_size//(nao_pair*f8_size)),nmoj)),
                   max(1, numpy.minimum(int(ioblk_size//(nij_pair*f8_size)),nmol)))
    '''
    ideally, the final transformed eris should have a chunk of nmoj x nmol to
    optimize read operations. However, I'm chunking the row size so that the
    write operations during the transform can be done as fast as possible.
    '''
    chunks_full = (numpy.minimum(int(ioblk_size//(nkl_pair*f8_size)),nmoj),nmol)

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            feri = h5py.File(erifile)
            if dataname in feri:
                del(feri[dataname])
        else:
            feri = h5py.File(erifile,'w',libver='latest')
    else:
        assert(isinstance(erifile, h5py.Group))
        feri = erifile
    h5d_eri = feri.create_dataset(dataname,(nij_pair,nkl_pair),'f8',chunks=chunks_full)

    feri_swap = lib.H5TmpFile(libver='latest')
    half_eri = feri_swap.create_dataset(dataname,(nij_pair,nao_pair),'f8',chunks=chunks_half)

    log.debug('Memory information:')
    log.debug('  IOBLK_SIZE (MB): {}'.format(ioblk_size))
    log.debug('  jxl {}x{}, half eri chunk dim  {}x{}'.format(nmoj,nmol,chunks_half[0],chunks_half[1]))
    log.debug('  jxl {}x{}, full eri chunk dim {}x{}'.format(nmoj,nmol,chunks_full[0],chunks_full[1]))
    log.debug('  Final disk eri size (MB): {:.3g}, chunked {:.3g}'
              .format(nij_pair*nkl_pair*f8_size,numpy.prod(chunks_full)*f8_size))
    log.debug('  Half transformed eri size (MB): {:.3g}, chunked {:.3g}'
              .format(nij_pair*nao_pair*f8_size,numpy.prod(chunks_half)*f8_size))
    log.debug('  RAM buffer for half transform (MB): {:.3g}'
             .format(nij_pair*chunks_half[1]*f8_size*2))
    log.debug('  RAM buffer for full transform (MB): {:.3g}'
             .format(f8_size*chunks_full[0]*nkl_pair*2 + chunks_half[0]*nao_pair*f8_size*2))

    def save1(piece,buf):
        start = piece*chunks_half[1]
        stop = (piece+1)*chunks_half[1]
        if stop > nao_pair:
            stop = nao_pair
        half_eri[:,start:stop] = buf[:,:stop-start]
        return

    def load2(piece):
        start = piece*chunks_half[0]
        stop = (piece+1)*chunks_half[0]
        if stop > nij_pair:
            stop = nij_pair
            if start >= nij_pair:
                start = stop - 1
        return half_eri[start:stop,:]

    def prefetch2(piece):
        start = piece*chunks_half[0]
        stop = (piece+1)*chunks_half[0]
        if stop > nij_pair:
            stop = nij_pair
            if start >= nij_pair:
                start = stop - 1
        buf_prefetch[:stop-start,:] = half_eri[start:stop,:]
        return

    def save2(piece,buf):
        start = piece*chunks_full[0]
        stop = (piece+1)*chunks_full[0]
        if stop > nij_pair:
            stop = nij_pair
        h5d_eri[start:stop,:] = buf[:stop-start,:]
        return

    # transform \mu\nu -> ij
    cput0 = time.clock(), time.time()
    Cimu = mo_coeffs[0].conj().transpose()
    buf_write = numpy.empty((nij_pair,chunks_half[1]))
    buf_out = numpy.empty_like(buf_write)
    wpiece = 0
    with lib.call_in_background(save1) as async_write:
        for lo in range(nao_pair):
            if lo % chunks_half[1] == 0 and lo > 0:
                #save1(wpiece,buf_write)
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece,buf_out)
                wpiece += 1
            buf = lib.unpack_row(eri,lo)
            uv = lib.unpack_tril(buf)
            uv = Cimu.dot(uv).dot(mo_coeffs[1])
            if ij_red:
                ij = numpy.ravel(uv) # grabs by row
            else:
                ij = lib.pack_tril(uv)
            buf_write[:,lo % chunks_half[1]] = ij
    # final write operation & cleanup
    save1(wpiece,buf_write)
    log.timer('(uv|lo) -> (ij|lo)', *cput0)
    uv = None
    ij = None
    buf = None

    # transform \lambda\sigma -> kl
    cput1 = time.clock(), time.time()
    Cklam = mo_coeffs[2].conj().transpose()
    buf_write = numpy.empty((chunks_full[0],nkl_pair))
    buf_out = numpy.empty_like(buf_write)
    buf_read = numpy.empty((chunks_half[0],nao_pair))
    buf_prefetch = numpy.empty_like(buf_read)
    rpiece = 0
    wpiece = 0
    with lib.call_in_background(save2,prefetch2) as (async_write,prefetch):
        buf_read = load2(rpiece)
        prefetch(rpiece+1)
        for ij in range(nij_pair):
            if ij % chunks_full[0] == 0 and ij > 0:
                #save2(wpiece,buf_write)
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece,buf_out)
                wpiece += 1
            if ij % chunks_half[0] == 0 and ij > 0:
                #buf_read = load2(rpiece)
                buf_read, buf_prefetch = buf_prefetch, buf_read
                rpiece += 1
                prefetch(rpiece+1)
            lo = lib.unpack_tril(buf_read[ij % chunks_half[0],:])
            lo = Cklam.dot(lo).dot(mo_coeffs[3])
            if kl_red:
                kl = numpy.ravel(lo)
            else:
                kl = lib.pack_tril(lo)
            buf_write[ij % chunks_full[0],:] = kl
    save2(wpiece,buf_write)
    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile
Exemplo n.º 4
0
def general(eri,
            mo_coeffs,
            erifile,
            dataname='eri_mo',
            ioblk_size=IOBLK_SIZE,
            compact=True,
            verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.
    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.
    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By assigning
            different dataname, the existed integral file can be reused.  If
            the erifile contains the dataname, the new integrals data will
            overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        compact : bool
            When compact is True, depending on the four oribital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals


    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
                For lo = 1 -> npair
                    Unpack row lo
                    Unpack row lo to matrix E_{uv}^{lo}
                    Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                    Ravel or pack E_{ij}^{lo}
                    Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
                For ij = 1 -> nij_pair
                    Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                    Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                    Repack E_{kl}^{ij}
                    Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being transformed.
        Since entire rows or columns need to be read in, the arrays are chunked
        such that IOBLK_SIZE = row/col x chunking col/row. For example, for the
        first half transform, we would save in nij_pair x IOBLK_SIZE/nij_pair,
        then load in IOBLK_SIZE/nkl_pair x npair for the second half transform.

        ------ kl ----->
        |jxl
        |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl. If the super-rows/cols are
        larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly. The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.

    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nmok = mo_coeffs[2].shape[1]
    nmol = mo_coeffs[3].shape[1]
    nao = mo_coeffs[0].shape[0]

    nao_pair = nao * (nao + 1) // 2
    if compact and iden_coeffs(mo_coeffs[0], mo_coeffs[1]):
        ij_red = False
        nij_pair = nmoi * (nmoi + 1) // 2
    else:
        ij_red = True
        nij_pair = nmoi * nmoj
    if compact and iden_coeffs(mo_coeffs[2], mo_coeffs[3]):
        kl_red = False
        nkl_pair = nmok * (nmok + 1) // 2
    else:
        kl_red = True
        nkl_pair = nmok * nmol

    chunks_half = (max(
        1, numpy.minimum(int(ioblk_size // (nao_pair * f8_size)), nmoj)),
                   max(
                       1,
                       numpy.minimum(int(ioblk_size // (nij_pair * f8_size)),
                                     nmol)))
    '''
    ideally, the final transformed eris should have a chunk of nmoj x nmol to
    optimize read operations. However, I'm chunking the row size so that the
    write operations during the transform can be done as fast as possible.
    '''
    chunks_full = (numpy.minimum(int(ioblk_size // (nkl_pair * f8_size)),
                                 nmoj), nmol)

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            feri = h5py.File(erifile)
            if dataname in feri:
                del (feri[dataname])
        else:
            feri = h5py.File(erifile, 'w', libver='latest')
    else:
        assert (isinstance(erifile, h5py.Group))
        feri = erifile
    h5d_eri = feri.create_dataset(dataname, (nij_pair, nkl_pair),
                                  'f8',
                                  chunks=chunks_full)

    feri_swap = lib.H5TmpFile(libver='latest')
    half_eri = feri_swap.create_dataset(dataname, (nij_pair, nao_pair),
                                        'f8',
                                        chunks=chunks_half)

    log.debug('Memory information:')
    log.debug('  IOBLK_SIZE (MB): {}'.format(ioblk_size))
    log.debug('  jxl {}x{}, half eri chunk dim  {}x{}'.format(
        nmoj, nmol, chunks_half[0], chunks_half[1]))
    log.debug('  jxl {}x{}, full eri chunk dim {}x{}'.format(
        nmoj, nmol, chunks_full[0], chunks_full[1]))
    log.debug('  Final disk eri size (MB): {:.3g}, chunked {:.3g}'.format(
        nij_pair * nkl_pair * f8_size,
        numpy.prod(chunks_full) * f8_size))
    log.debug(
        '  Half transformed eri size (MB): {:.3g}, chunked {:.3g}'.format(
            nij_pair * nao_pair * f8_size,
            numpy.prod(chunks_half) * f8_size))
    log.debug('  RAM buffer for half transform (MB): {:.3g}'.format(
        nij_pair * chunks_half[1] * f8_size * 2))
    log.debug('  RAM buffer for full transform (MB): {:.3g}'.format(
        f8_size * chunks_full[0] * nkl_pair * 2 +
        chunks_half[0] * nao_pair * f8_size * 2))

    def save1(piece, buf):
        start = piece * chunks_half[1]
        stop = (piece + 1) * chunks_half[1]
        if stop > nao_pair:
            stop = nao_pair
        half_eri[:, start:stop] = buf[:, :stop - start]
        return

    def load2(piece):
        start = piece * chunks_half[0]
        stop = (piece + 1) * chunks_half[0]
        if stop > nij_pair:
            stop = nij_pair
            if start >= nij_pair:
                start = stop - 1
        return half_eri[start:stop, :]

    def prefetch2(piece):
        start = piece * chunks_half[0]
        stop = (piece + 1) * chunks_half[0]
        if stop > nij_pair:
            stop = nij_pair
            if start >= nij_pair:
                start = stop - 1
        buf_prefetch[:stop - start, :] = half_eri[start:stop, :]
        return

    def save2(piece, buf):
        start = piece * chunks_full[0]
        stop = (piece + 1) * chunks_full[0]
        if stop > nij_pair:
            stop = nij_pair
        h5d_eri[start:stop, :] = buf[:stop - start, :]
        return

    # transform \mu\nu -> ij
    cput0 = time.clock(), time.time()
    Cimu = mo_coeffs[0].conj().transpose()
    buf_write = numpy.empty((nij_pair, chunks_half[1]))
    buf_out = numpy.empty_like(buf_write)
    wpiece = 0
    with lib.call_in_background(save1) as async_write:
        for lo in range(nao_pair):
            if lo % chunks_half[1] == 0 and lo > 0:
                #save1(wpiece,buf_write)
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece, buf_out)
                wpiece += 1
            buf = lib.unpack_row(eri, lo)
            uv = lib.unpack_tril(buf)
            uv = Cimu.dot(uv).dot(mo_coeffs[1])
            if ij_red:
                ij = numpy.ravel(uv)  # grabs by row
            else:
                ij = lib.pack_tril(uv)
            buf_write[:, lo % chunks_half[1]] = ij
    # final write operation & cleanup
    save1(wpiece, buf_write)
    log.timer('(uv|lo) -> (ij|lo)', *cput0)
    uv = None
    ij = None
    buf = None

    # transform \lambda\sigma -> kl
    cput1 = time.clock(), time.time()
    Cklam = mo_coeffs[2].conj().transpose()
    buf_write = numpy.empty((chunks_full[0], nkl_pair))
    buf_out = numpy.empty_like(buf_write)
    buf_read = numpy.empty((chunks_half[0], nao_pair))
    buf_prefetch = numpy.empty_like(buf_read)
    rpiece = 0
    wpiece = 0
    with lib.call_in_background(save2, prefetch2) as (async_write, prefetch):
        buf_read = load2(rpiece)
        prefetch(rpiece + 1)
        for ij in range(nij_pair):
            if ij % chunks_full[0] == 0 and ij > 0:
                #save2(wpiece,buf_write)
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece, buf_out)
                wpiece += 1
            if ij % chunks_half[0] == 0 and ij > 0:
                #buf_read = load2(rpiece)
                buf_read, buf_prefetch = buf_prefetch, buf_read
                rpiece += 1
                prefetch(rpiece + 1)
            lo = lib.unpack_tril(buf_read[ij % chunks_half[0], :])
            lo = Cklam.dot(lo).dot(mo_coeffs[3])
            if kl_red:
                kl = numpy.ravel(lo)
            else:
                kl = lib.pack_tril(lo)
            buf_write[ij % chunks_full[0], :] = kl
    save2(wpiece, buf_write)
    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile
Exemplo n.º 5
0
def general(eri, mo_coeffs, erifile, dataname='eri_mo',
            ioblk_size=IOBLK_SIZE, compact=True, verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.
    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.
    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By assigning
            different dataname, the existed integral file can be reused.  If
            the erifile contains the dataname, the new integrals data will
            overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        compact : bool
            When compact is True, depending on the four oribital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals


    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
                For lo = 1 -> npair
                    Unpack row lo
                    Unpack row lo to matrix E_{uv}^{lo}
                    Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                    Ravel or pack E_{ij}^{lo}
                    Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
                For ij = 1 -> nij_pair
                    Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                    Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                    Repack E_{kl}^{ij}
                    Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being transformed.
        Since entire rows or columns need to be read in, the arrays are chunked
        such that IOBLK_SIZE = row/col x chunking col/row. For example, for the
        first half transform, we would save in nij_pair x IOBLK_SIZE/nij_pair,
        then load in IOBLK_SIZE/nkl_pair x npair for the second half transform.

        ------ kl ----->
        |jxl
        |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl. If the super-rows/cols are
        larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly. The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.

    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    eri_ao = numpy.asarray(eri, order='C')
    nao, nmoi = mo_coeffs[0].shape
    nmoj = mo_coeffs[1].shape[1]
    nao_pair = nao*(nao+1)//2
    ijmosym, nij_pair, moij, ijshape = _conc_mos(mo_coeffs[0], mo_coeffs[1], compact)
    klmosym, nkl_pair, mokl, klshape = _conc_mos(mo_coeffs[2], mo_coeffs[3], compact)
    ijshape = (ijshape[0], ijshape[1]-ijshape[0],
               ijshape[2], ijshape[3]-ijshape[2])
    dtype = numpy.result_type(eri, *mo_coeffs)
    typesize = dtype.itemsize/1e6 # in MB

    if nij_pair == 0:
        return numpy.empty((nij_pair,nkl_pair))

    ij_red = ijmosym == 's1'
    kl_red = klmosym == 's1'

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            feri = h5py.File(erifile, 'a')
            if dataname in feri:
                del(feri[dataname])
        else:
            feri = h5py.File(erifile,'w',libver='latest')
    else:
        assert(isinstance(erifile, h5py.Group))
        feri = erifile

    h5d_eri = feri.create_dataset(dataname,(nij_pair,nkl_pair), dtype.char)
    feri_swap = lib.H5TmpFile(libver='latest')
    chunk_size = min(nao_pair, max(4, int(ioblk_size*1e6/8/nao_pair)))

    log.debug('Memory information:')
    log.debug('  IOBLK_SIZE (MB): {}  chunk_size: {}'
              .format(ioblk_size, chunk_size))
    log.debug('  Final disk eri size (MB): {:.3g}'
              .format(nij_pair*nkl_pair*typesize))
    log.debug('  Half transformed eri size (MB): {:.3g}'
              .format(nij_pair*nao_pair*typesize))
    log.debug('  RAM buffer (MB): {:.3g}'
             .format(nij_pair*IOBLK_SIZE*typesize*2))

    if eri_ao.size == nao_pair**2: # 4-fold symmetry
        # half_e1 first transforms the indices which are contiguous in memory
        # transpose the 4-fold integrals to make ij the contiguous indices
        eri_ao = lib.transpose(eri_ao)
        ftrans = _ao2mo.libao2mo.AO2MOtranse1_incore_s4
    elif eri_ao.size == nao_pair*(nao_pair+1)//2:
        ftrans = _ao2mo.libao2mo.AO2MOtranse1_incore_s8
    else:
        raise NotImplementedError

    if ijmosym == 's2':
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_s2
    elif nmoi <= nmoj:
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_iltj
    else:
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_igtj
    fdrv = getattr(_ao2mo.libao2mo, 'AO2MOnr_e1incore_drv')

    def save(piece, buf):
        feri_swap[str(piece)] = buf.T

    # transform \mu\nu -> ij
    cput0 = time.clock(), time.time()
    with lib.call_in_background(save) as async_write:
        for istep, (p0, p1) in enumerate(lib.prange(0, nao_pair, chunk_size)):
            if dtype == numpy.double:
                buf = numpy.empty((p1-p0, nij_pair))
                fdrv(ftrans, fmmm,
                     buf.ctypes.data_as(ctypes.c_void_p),
                     eri_ao.ctypes.data_as(ctypes.c_void_p),
                     moij.ctypes.data_as(ctypes.c_void_p),
                     ctypes.c_int(p0), ctypes.c_int(p1-p0),
                     ctypes.c_int(nao),
                     ctypes.c_int(ijshape[0]), ctypes.c_int(ijshape[1]),
                     ctypes.c_int(ijshape[2]), ctypes.c_int(ijshape[3]))
            else:  # complex
                tmp = numpy.empty((p1-p0, nao_pair))
                if eri_ao.size == nao_pair**2: # 4-fold symmetry
                    tmp = eri_ao[p0:p1]
                else: # 8-fold symmetry
                    for i in range(p0, p1):
                        tmp[i-p0] = lib.unpack_row(eri_ao, i)
                tmp = lib.unpack_tril(tmp, filltriu=lib.SYMMETRIC)
                buf = lib.einsum('xpq,pi,qj->xij', tmp, mo_coeffs[0].conj(), mo_coeffs[1])
                if ij_red:
                    buf = buf.reshape(p1-p0,-1) # grabs by row
                else:
                    buf = lib.pack_tril(buf)

            async_write(istep, buf)

    log.timer('(uv|lo) -> (ij|lo)', *cput0)

    # transform \lambda\sigma -> kl
    cput1 = time.clock(), time.time()
    Cklam = mo_coeffs[2].conj()
    buf_read = numpy.empty((chunk_size,nao_pair), dtype=dtype)
    buf_prefetch = numpy.empty_like(buf_read)

    def load(start, stop, buf):
        if start < stop:
            _load_from_h5g(feri_swap, start, stop, buf)

    def save(start, stop, buf):
        if start < stop:
            h5d_eri[start:stop] = buf[:stop-start]

    with lib.call_in_background(save,load) as (async_write, prefetch):
        for p0, p1 in lib.prange(0, nij_pair, chunk_size):
            if p0 == 0:
                load(p0, p1, buf_prefetch)

            buf_read, buf_prefetch = buf_prefetch, buf_read
            prefetch(p1, min(p1+chunk_size, nij_pair), buf_prefetch)

            lo = lib.unpack_tril(buf_read[:p1-p0], filltriu=lib.SYMMETRIC)
            lo = lib.einsum('xpq,pi,qj->xij', lo, Cklam, mo_coeffs[3])
            if kl_red:
                kl = lo.reshape(p1-p0,-1)
            else:
                kl = lib.pack_tril(lo)
            async_write(p0, p1, kl)

    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile