def CreateDataByStationAndInit(GridSize, DateBegin, DateEnd, PredictionWindow,
                               ListParam, WithTopo, TopoListParam, isLocal,
                               n_parallel):
    time_begin = time()

    if DateEnd < DateBegin:
        raise Exception('DateEnd is smaller than DateBegin.')

    assert GridSize % 2 == 1, 'Grid size must be an odd number.'
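    # note: an odd grid size presumably ensures that the station's nearest grid point lies
    # at the center of the GridSize x GridSize sub-grid extracted around each station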

    # different paths, depending on whether we run the script locally or on a cluster node
    if isLocal:
        ADDRESSdata = '/home/n1no/Documents/ethz/master_thesis/code/project/data/cosmo-1/data_subset'  # COSMO-1 outputs
        ADDRESStopo = '/home/n1no/Documents/ethz/master_thesis/code/project/data'  # base address of topo files
        ADDRESSobst = '/home/n1no/Documents/ethz/master_thesis/code/project/data/observations/'  # base address of obs files
        DESTINATION = '/home/n1no/Documents/ethz/master_thesis/code/project/data/preprocessed_data/station_init/grid_size_' + str(
            GridSize)  # target directory for all generated files
    else:
        ADDRESSdata = '/mnt/data/bhendj/full/cosmo-1'  # COSMO-1 outputs
        ADDRESStopo = '/mnt/ds3lab-scratch/ninow/topo'  # base address of topo files
        ADDRESSobst = '/mnt/ds3lab-scratch/ninow/observations'  # base address of obs files
        DESTINATION = '/mnt/ds3lab-scratch/ninow/preprocessed_data/station_init/grid_size_' + str(
            GridSize)  # target directory for all generated files

    # create an output folder for each station, based on the station ids
    OBS = xr.open_dataset(ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc')
    station_ids = OBS['station_id'].data
    OBS.close()

    station_paths = []
    for S in station_ids:
        # prepare output folders for each station
        station_paths += [DESTINATION + '/Station_' + str(S)]
        if not os.path.exists(station_paths[-1]):
            os.makedirs(station_paths[-1])

    # get all COSMO-1 files that are in the given time interval and have not yet been processed, i.e. do not
    # already exist in the output folder
    folders = DataUtils.getFilesToProcess(ADDRESSdata, DESTINATION,
                                          'StationAndInit', DateBegin, DateEnd)
    folders.sort()

    # calculate begin and end indices to exclude files that are not in the specified time interval
    begin, end = -1, -1
    for idx, folder in enumerate(folders):
        if folder[:-4] >= DateBegin:
            begin = idx
            break

    for idx, folder in enumerate(folders):
        if folder[:-4] <= DateEnd:
            end = idx
        else:
            break

    if begin == -1 or end == -1:
        raise Exception('Could not find start or end in array.')

    folders = folders[begin:end + 1]
    print('%s files are left to be preprocessed.' % len(folders))

    # split the folders into n_parallel approximately equal splits
    if n_parallel <= 1:
        folder_splits = [folders]
    else:
        n_folders = len(folders)
        indices = np.linspace(0, n_folders, n_parallel + 1).astype(int)
        folder_splits = [
            folders[indices[i]:indices[i + 1]] for i in range(n_parallel)
        ]
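        # illustration (assumed numbers): with 10 folders and n_parallel = 3,
        # np.linspace(0, 10, 4).astype(int) yields [0, 3, 6, 10], i.e. splits of sizes 3, 3 and 4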

    # drop empty splits, which can occur when there are fewer folders than processes
    folder_splits = [split for split in folder_splits if len(split) > 0]

    # take timestamp after set-up
    time_setup = time()

    with Pool(processes=n_parallel) as pool:
        # run preprocessing in parallel for all splits and keep the processes in a list to sync them later
        process_results = []
        for idx_split, split in enumerate(folder_splits):
            print('Process %s with range [%s, %s] queued.' %
                  (idx_split, split[0], split[-1]))
            # only the first process calculates the topo data, since it is time invariant
            if idx_split == 0:
                isTopo = WithTopo
            else:
                isTopo = 0

            process_results.append(
                pool.apply_async(
                    GetData, (idx_split, ADDRESSdata, ADDRESStopo, ADDRESSobst,
                              DESTINATION, ListParam, TopoListParam, GridSize,
                              isTopo, split, PredictionWindow, isLocal)))

        # force the parent process to wait for all forked child processes
        for ps_idx, ps_result in enumerate(process_results):
            # sync processes
            _ = ps_result.get()
            print('[Process %s] Synchronized after data creation.' % ps_idx)

    # take timestamp after completing all processes
    time_end = time()

    # dump preprocessing information in a descriptive JSON file
    preprocessing_information = {
        'grid_size': GridSize,
        'data_begin': DateBegin,
        'data_end': DateEnd,
        'parameters': ListParam,
        'future_hours': PredictionWindow,
        'n_processes': n_parallel,
        'time_setup': str(timedelta(seconds=(time_setup - time_begin))),
        'time_preprocessing': str(timedelta(seconds=(time_end - time_setup)))
    }

    preprocessing_information_json = json.dumps(preprocessing_information)
    with open(DESTINATION + '/setup.json', 'w') as f:
        f.write(preprocessing_information_json)

    print('Preprocessing successfully finished in %s.' %
          str(timedelta(seconds=(time_end - time_begin))))
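

# A minimal, hypothetical call sketch (all values below are assumptions, not taken from the
# original project; the date strings must match the folder naming that
# DataUtils.getFilesToProcess expects):
#
# CreateDataByStationAndInit(GridSize=9, DateBegin='15100100', DateEnd='15103121',
#                            PredictionWindow=[0, 1, 2, 3], ListParam=['T_2M'], WithTopo=1,
#                            TopoListParam=['HH'], isLocal=True, n_parallel=4)
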
def CreateBaselineData(DateBegin, DateEnd, PredictionWindow, isLocal,
                       n_parallel):
    time_begin = time()

    if DateEnd < DateBegin:
        raise Exception('DateEnd is smaller than DateBegin.')

    # different paths, depending on whether we run the script locally or on a cluster node
    if isLocal:
        ADDRESSdata = '/home/n1no/Documents/ethz/master_thesis/code/project/data/cosmo-1/data_subset'  # COSMO-1 outputs
        ADDRESStopo = '/home/n1no/Documents/ethz/master_thesis/code/project/data'  # base address of topo files
        ADDRESSobst = '/home/n1no/Documents/ethz/master_thesis/code/project/data/observations/'  # base address of obs files
        DESTINATION = '/home/n1no/Documents/ethz/master_thesis/code/project/data/preprocessed_data/baseline'  # target directory for all generated files
    else:
        ADDRESSdata = '/mnt/data/bhendj/full/cosmo-1'  # COSMO-1 outputs
        ADDRESStopo = '/mnt/ds3lab-scratch/ninow/topo'  # base address of topo files
        ADDRESSobst = '/mnt/ds3lab-scratch/ninow/observations'  # base address of obs files
        DESTINATION = '/mnt/ds3lab-scratch/ninow/preprocessed_data/baseline'  # target directory for all generated files

    if not os.path.exists(DESTINATION):
        os.makedirs(DESTINATION)

    # load the observation and topography datasets and read the station ids
    # (per-station output folders are created further below)
    OBS = xr.open_dataset(ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc')
    TOPO = xr.open_dataset(ADDRESStopo + '/topodata.nc')
    station_ids = OBS['station_id'].data

    # extract time invariant features for each station and the corresponding sub-grid
    if not os.path.exists(DESTINATION + '/station_neighbors.pkl'):

        station_neighbors = {}

        # calculate for each station the neighbors on the grid in parallel
        with Pool(processes=n_parallel) as pool:
            process_results = []

            gridHeightData = TOPO.HH.data
            gridLatData = TOPO.lat.data
            gridLonData = TOPO.lon.data
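            # the raw numpy arrays are extracted once here, so only these arrays (rather than
            # the full xarray datasets) are passed to the worker processes below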

            # start a new process with the work function for each data split
            for idx_S, S in enumerate(station_ids):
                # calculate height difference between grid points and station
                station_height = OBS['height'].sel(station_id=S).data
                station_lat = OBS['lat'].sel(station_id=S).data
                station_lon = OBS['lon'].sel(station_id=S).data

                print('Neighborhood calculation for station %s queued.' % S)
                process_results.append(
                    pool.apply_async(
                        getStationNeighbors,
                        (S, gridHeightData, gridLatData, gridLonData,
                         station_height, station_lat, station_lon)))

            # aggregate results from all processes
            for ps_idx, ps_result in enumerate(process_results):
                # sync processes
                S, neighbor_data = ps_result.get()
                station_neighbors[S] = neighbor_data
                print('[Process %s] Synchronized after data creation.' %
                      ps_idx)

        with open(DESTINATION + '/station_neighbors.pkl', 'wb') as handle:
            pkl.dump(station_neighbors, handle, protocol=pkl.HIGHEST_PROTOCOL)
        print(
            'Station time invariant features have been calculated and stored.')
    else:
        with open(DESTINATION + '/station_neighbors.pkl', 'rb') as handle:
            station_neighbors = pkl.load(handle)
        print(
            'Station time invariant features have been found on disk and were therefore not created again.'
        )

    OBS.close()
    TOPO.close()

    for S in station_ids:
        temp_output_path = DESTINATION + '/temp/station_%s' % S
        if not os.path.exists(temp_output_path):
            os.makedirs(temp_output_path)

    # get all COSMO-1 files that are in the given time interval and have not yet been processed, i.e. do not
    # already exist in the output folder
    folders = DataUtils.getFilesToProcess(ADDRESSdata, DESTINATION, 'Station',
                                          DateBegin, DateEnd)
    folders.sort()

    # calculate begin and end indices to exclude files that are not in the specified time interval
    begin, end = -1, -1
    for idx, folder in enumerate(folders):
        if folder[:-4] >= DateBegin:
            begin = idx
            break

    for idx, folder in enumerate(folders):
        if folder[:-4] <= DateEnd:
            end = idx
        else:
            break

    if begin == -1 or end == -1:
        raise Exception('Could not find start or end in array.')

    folders = folders[begin:end + 1]
    print('%s files are left to be preprocessed.' % len(folders))

    # split the folders into n_parallel approximately equal splits
    if n_parallel <= 1:
        folder_splits = [folders]
    else:
        n_folders = len(folders)
        indices = np.linspace(0, n_folders, n_parallel + 1).astype(int)
        folder_splits = [
            folders[indices[i]:indices[i + 1]] for i in range(n_parallel)
        ]

    # drop empty splits, which can occur when there are fewer folders than processes
    folder_splits = [split for split in folder_splits if len(split) > 0]
    # take timestamp after set-up
    time_setup = time()

    # run preprocessing in parallel for all splits and keep the process handles in a list to sync them later:
    # min/max are calculated and samples are selected on each data split in parallel
    with Pool(processes=n_parallel) as pool:
        process_results = []

        # start a new process with the work function for each data split
        for idx_split, split in enumerate(folder_splits):
            print('Process %s with range [%s, %s] queued.' %
                  (idx_split, split[0], split[-1]))
            process_results.append(
                pool.apply_async(
                    GetDataWrapper,
                    (idx_split, ADDRESSdata, ADDRESStopo, ADDRESSobst,
                     DESTINATION, split, station_neighbors, PredictionWindow,
                     isLocal)))

        # aggregate results from all processes
        for ps_idx, ps_result in enumerate(process_results):
            # sync processes
            _ = ps_result.get()
            print('[Process %s] Synchronized after data creation.' % ps_idx)

        # collect the per-station temp folders created earlier, named 'station_<id>'
        station_folders_paths = [
            f for f in os.listdir(DESTINATION + '/temp')
            if re.match(r'^station_([0-9]+?)$', f)
        ]

        process_results = []
        for ps_idx, station_folder in enumerate(station_folders_paths):
            print('Process %s with station folder %s queued.' %
                  (ps_idx, station_folder))
            process_results.append(
                pool.apply_async(aggregateProcessFiles,
                                 (ps_idx, DESTINATION, station_folder)))

        # aggregate results from all processes
        for ps_idx, ps_result in enumerate(process_results):
            # sync processes
            _ = ps_result.get()
            print('[Process %s] Synchronized after aggregation.' % ps_idx)

    # take timestamp after completing all processes
    time_end = time()

    # dump preprocessing information in a descriptive JSON file
    preprocessing_information = {
        'data_begin': DateBegin,
        'data_end': DateEnd,
        'future_hours': PredictionWindow,
        'n_processes': n_parallel,
        'time_setup': str(timedelta(seconds=(time_setup - time_begin))),
        'time_preprocessing': str(timedelta(seconds=(time_end - time_setup)))
    }

    preprocessing_information_json = json.dumps(preprocessing_information)
    with open(DESTINATION + '/setup.json', 'w') as f:
        f.write(preprocessing_information_json)

    print('Station baseline preprocessing successfully finished in %s.' %
          str(timedelta(seconds=(time_end - time_begin))))
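

# A minimal, hypothetical call sketch for the baseline preprocessing (values below are
# assumptions, not taken from the original project):
#
# CreateBaselineData(DateBegin='15100100', DateEnd='15103121', PredictionWindow=[0, 1, 2, 3],
#                    isLocal=True, n_parallel=4)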