Example #1
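These examples reference several names defined elsewhere in the project (config, utils, timed_block, _checkexists). Below is a minimal sketch of the assumed imports and helper; the _checkexists body is a plausible reconstruction, not the project's exact code.

import glob
import logging
import os
import re
import shutil
import tarfile
import multiprocessing as mp

import h5py
import numpy as np

import config                   # Project-specific paths and constants (assumed).
import utils                    # Project utilities (assumed).
from utils import timed_block   # Context manager that times / logs a block.


def _checkexists(filename):
    """Raise a FileNotFoundError if the file does not exist, else return it.
    (Plausible reconstruction of the project helper.)
    """
    if not os.path.isfile(filename):
        raise FileNotFoundError(filename)
    return filename
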
def load_time_domain(nsteps=None):
    """Load the time domain corresponding to the GEMS simulation data.

    Parameters
    ----------
    nsteps : int or None
        How many entries to extract from the time domain (default all).

    Returns
    -------
    time_domain : (nsteps,) ndarray
        The simulation time domain, with uniform spacing config.DT.
    """
    # Locate the data.
    data_path = _checkexists(config.gems_data_path())

    # Extract the data.
    with timed_block(f"Loading time domain data from {data_path}"):
        with h5py.File(data_path, 'r') as hf:
            time_domain = hf["time"][:nsteps]

    # Check time spacing.
    if not np.allclose(np.diff(time_domain), config.DT):
        raise ValueError("time domain DT != config.DT")

    # If more steps are requested than are stored, extend the time domain
    # with np.linspace() so the spacing remains exactly config.DT.
    if np.isscalar(nsteps) and time_domain.size < nsteps:
        t0 = time_domain[0]
        return np.linspace(t0, t0 + (nsteps - 1)*config.DT, nsteps)

    return time_domain
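
A brief usage sketch (the step count below is an arbitrary placeholder):

# Load the first 50,000 entries of the time domain.
time_domain = load_time_domain(50000)

# Uniform spacing is guaranteed by the config.DT check above.
assert np.allclose(np.diff(time_domain), config.DT)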
Example #2
def load_gems_data(rows=None, cols=None):
    """Load the indicated rows and colums of GEMS simulation data.
    This is a large file, so try to only load what is needed at the moment.

    Parameters
    ----------
    rows : int, slice, or (nrows,) ndarray of integer indices
        Which rows (spatial locations) to extract from the data (default all).
        If an integer, extract the first `rows` rows.

    cols : int or slice
        Which columns (temporal points) to extract from the data (default all).
        If an integer, extract the first `cols` columns.

    Returns
    -------
    gems_data : (nrows,ncols) ndarray
        The indicated rows / columns of the data.

    time_domain : (ncols,) ndarray
        The time (in seconds) associated with each column of extracted data.
    """
    # Locate the data.
    data_path = _checkexists(config.gems_data_path())

    # Ensure rows are loaded in ascending index order (HDF5 requirement).
    if isinstance(rows, (np.ndarray, list)):
        row_order = np.argsort(rows)
        rows = np.array(rows, copy=True)[row_order]
        old_row_order = np.argsort(row_order)
    elif np.isscalar(rows) or rows is None:
        rows = slice(None, rows)
    if np.isscalar(cols) or cols is None:
        cols = slice(None, cols)

    # Extract the data.
    NROWS = config.NUM_GEMSVARS * config.DOF
    with timed_block(f"Loading GEMS simulation data from {data_path}"):
        with h5py.File(data_path, 'r') as hf:
            # Check data shape.
            if hf["data"].shape[0] != NROWS:
                raise RuntimeError(f"data should have exactly {NROWS} rows")
            gems_data = hf["data"][rows, cols]
            time_domain = hf["time"][cols]

    # Restore row ordering if needed.
    if isinstance(rows, np.ndarray):
        gems_data = gems_data[old_row_order, :]

    return gems_data, time_domain
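
A usage sketch; the row indices and column count below are arbitrary placeholders:

# First 10 spatial rows and first 100 snapshots.
gems_data, t = load_gems_data(rows=10, cols=100)

# Specific (possibly unsorted) rows, all snapshots. The rows are read from
# the HDF5 file in ascending order, then returned in the requested order.
gems_data, t = load_gems_data(rows=np.array([12, 4, 8]))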
def _read_tar_and_save_data(tfile, start, stop, parallel=True):
    """Read snapshot data directly from a .tar archive (without untar-ing it)
    and copy the data to the snapshot matrix HDF5 file config.GEMS_DATA_FILE.

    Parameters
    ----------
    tfile : str
        Name of a .tar file to read data from.

    start : int
        Index of the first snapshot contained in the .tar file.

    stop : int
        Index of the last snapshot contained in the .tar file (exclusive),
        i.e., the file holds snapshots start, start + 1, ..., stop - 1.

    parallel : bool
        If True, only print progress when start == 0, and acquire / release
        the global lock when writing to the HDF5 file.
    """
    # Allocate space for the snapshots in this .tar file.
    num_snapshots = stop - start
    gems_data = np.empty((config.DOF*config.NUM_GEMSVARS, num_snapshots),
                         dtype=np.float64)
    times = np.empty(num_snapshots, dtype=np.float64)

    # Extract the data from the .tar file.
    with tarfile.open(tfile, 'r') as archive:
        for j,tarinfo in enumerate(archive):

            # Read the contents of one file.
            with archive.extractfile(tarinfo) as datfile:
                contents = datfile.read().decode()

            # Get the simulation time from the file name.
            simtime = float(_SIMTIME.findall(tarinfo.name)[0]) * config.DT

            # Parse and verify the header.
            header_end = _HEADEREND.findall(contents)[0]
            headersize = contents.find(header_end) + len(header_end)
            if int(_ELEMENTS.findall(contents[:headersize])[0]) != config.DOF:
                raise RuntimeError(f"{tarinfo.name} DOF != config.DOF")

            # Extract and store the variable data.
            data = contents[headersize:].split()[:gems_data.shape[0]]
            gems_data[:,j] = np.array(data, dtype=np.float64)
            times[j] = simtime
            if start == 0 or not parallel:
                print(f"\rProcessed file {j+1:05d}/{num_snapshots}",
                      end='', flush=True)
    if start == 0 or not parallel:
        print()

    # Save the data to the appropriate slice.
    save_path = config.gems_data_path()
    if parallel:
        lock.acquire()  # Only allow one process to open the file at a time.
    with utils.timed_block(f"Saving snapshots {start}-{stop} to HDF5"):
        with h5py.File(save_path, 'a') as hf:
            hf["data"][:,start:stop] = gems_data
            hf["time"][  start:stop] = times
    print(f"Data saved to {save_path}.")
    if parallel:
        lock.release()  # Let other processes resume.
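
The module-level regular expressions _SIMTIME, _HEADEREND, and _ELEMENTS used above are not shown in these examples, and their exact patterns depend on the GEMS output format. A hypothetical sketch of their structure:

# Hypothetical patterns -- adjust to the actual GEMS file names and headers.
_SIMTIME = re.compile(r"(\d+)\.dat")            # Time step index in a file name.
_HEADEREND = re.compile(r"DT=.*?\n")            # Final line of the ASCII header.
_ELEMENTS = re.compile(r"Elements=\s*(\d+)")    # Element (DOF) count in the header.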
def main(data_folder, overwrite=False, serial=False):
    """Extract snapshot data, in parallel, from the .tar files in the
    specified folder of the form Data_<first-snapshot>to<last-snapshot>.tar.

    Parameters
    ----------
    data_folder : str
        Path to the folder that contains the raw GEMS .tar data files,
        preferably as an absolute path (e.g., /path/to/folder).

    overwrite : bool
        If False and the snapshot matrix file exists, raise an error.
        If True, overwrite the existing snapshot matrix file if it exists.

    serial : bool
        If True, do the unpacking sequentially in 10,000 snapshot chunks.
        If False, do the unpacking in parallel with 10,000 snapshot chunks.
    """
    utils.reset_logger()

    # If it exists, copy the grid file to the Tecplot data directory.
    source = os.path.join(data_folder, config.GRID_FILE)
    if os.path.isfile(source):
        target = config.grid_data_path()
        with utils.timed_block(f"Copying {source} to {target}"):
            shutil.copy(source, target)
    else:
        logging.warning(f"Grid file {source} not found!")

    # Locate and sort raw .tar files.
    target_pattern = os.path.join(data_folder, "Data_*to*.tar")
    tarfiles = sorted(glob.glob(target_pattern))
    if not tarfiles:
        raise FileNotFoundError(target_pattern)

    # Get the snapshot indices corresponding to each file from the file names.
    starts, stops = [], []
    for i,tfile in enumerate(tarfiles):
        matches = re.findall(r"Data_(\d+)to(\d+)\.tar", tfile)
        if not matches:
            raise ValueError(f"file {tfile} not named with convention "
                             "Data_<first-snapshot>to<last-snapshot>.tar")
        start, stop = [int(d) for d in matches[0]]
        if i == 0:
            start0 = start  # Offset
        starts.append(start - start0)
        stops.append(stop + 1 - start0)

        if i > 0 and stops[i-1] != starts[i]:
            raise ValueError(f"file {tfile} not continuous from previous set")
    num_snapshots = stops[-1]

    # Create an empty HDF5 file of appropriate size for the data.
    save_path = config.gems_data_path()
    if os.path.isfile(save_path) and not overwrite:
        raise FileExistsError(f"{save_path} (use --overwrite to overwrite)")
    with utils.timed_block("Initializing HDF5 file for data"):
        with h5py.File(save_path, 'w') as hf:
            hf.create_dataset("data", shape=(config.DOF*config.NUM_GEMSVARS,
                                             num_snapshots),
                                      dtype=np.float64)
            hf.create_dataset("time", shape=(num_snapshots,),
                                      dtype=np.float64)
    logging.info(f"Data file initialized as {save_path}.")

    # Read the files in chunks.
    args = zip(tarfiles, starts, stops)
    if serial:       # Read the files serially (sequentially).
        for tf, start, stop in args:
            _read_tar_and_save_data(tf, start, stop, parallel=False)
    else:            # Read the files in parallel.
        with mp.Pool(initializer=_globalize_lock, initargs=(mp.Lock(),),
                     processes=min([len(tarfiles), mp.cpu_count()])) as pool:
            pool.starmap(_read_tar_and_save_data, args)
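
The pool initializer _globalize_lock is referenced above but not included in these examples. Sharing a multiprocessing lock with worker processes is typically done through a module-level global, as in this sketch:

def _globalize_lock(lock_):
    """Store the given multiprocessing.Lock as a module-level global so that
    _read_tar_and_save_data() can serialize writes to the HDF5 file.
    """
    global lock
    lock = lock_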