Пример #1
0
def compute(dataset,
            calc,
            calc_grouping,
            tile_dimension,
            verbose=False,
            prefix=None):
    """
    Execute a calculation tile-by-tile and write the results into a single NetCDF file.

    A "file only" NetCDF output is created first. The calculation is then executed once per tile of the
    spatial grid and each tile's values are written into the matching row/column window of that file.

    :param dataset: Request dataset handed to ``NcDataset`` and ``ocgis.OcgOperations``.
    :param calc: Calculation definition(s) forwarded to ``ocgis.OcgOperations``.
    :param calc_grouping: Calculation grouping forwarded to ``ocgis.OcgOperations``.
    :param tile_dimension: Tile dimension passed to ``tile.get_tile_schema``. Coerced with ``int`` and must be
     greater than zero.
    :param bool verbose: If ``True``, print status messages and show a progress bar.
    :param prefix: Forwarded as ``prefix`` when creating the fill file.
    :raises ValueError: If ``tile_dimension`` is not greater than zero (or cannot be converted to ``int``).
    :raises NotImplementedError: If a target variable in the fill file is neither 3- nor 4-dimensional.
    :returns: The value returned by executing the file-only operations (used as the path of the fill file).
    """
    tile_dimension = int(tile_dimension)
    if tile_dimension <= 0:
        raise ValueError('"tile_dimension" must be greater than 0')

    progress = None
    orig_oc = ocgis.env.OPTIMIZE_FOR_CALC
    ocgis.env.OPTIMIZE_FOR_CALC = True
    try:
        ods = NcDataset(request_dataset=dataset)
        shp = ods.spatial.grid.shape
        if verbose:
            print('getting schema...')
        schema = tile.get_tile_schema(shp[0], shp[1], tile_dimension)
        lschema = len(schema)
        if verbose:
            print('getting fill file...')
        fill_file = ocgis.OcgOperations(dataset=dataset,
                                        file_only=True,
                                        calc=calc,
                                        calc_grouping=calc_grouping,
                                        output_format='nc',
                                        prefix=prefix).execute()
        if verbose:
            print('output file is: {0}'.format(fill_file))
            print('tile count: {0}'.format(lschema))
        fds = nc.Dataset(fill_file, 'a')
        # Close the fill file even when a tile fails so the handle is never leaked.
        try:
            if verbose:
                progress = ProgressBar('tiles progress')
            for ctr, indices in enumerate(schema.itervalues(), start=1):
                row = indices['row']
                col = indices['col']
                ret = ocgis.OcgOperations(dataset=dataset,
                                          slice=[None, row, col],
                                          calc=calc,
                                          calc_grouping=calc_grouping).execute()
                for variable in ret[1].variables.iterkeys():
                    ref = ret[1].calc[variable]
                    for k, v in ref.iteritems():
                        vref = fds.variables[k]
                        # The last two dimensions are indexed with the tile's row/column bounds.
                        if len(vref.shape) == 3:
                            vref[:, row[0]:row[1], col[0]:col[1]] = v
                        elif len(vref.shape) == 4:
                            vref[:, :, row[0]:row[1], col[0]:col[1]] = v
                        else:
                            raise NotImplementedError(vref.shape)
                        # Flush after every variable so completed tiles are on disk.
                        fds.sync()
                if verbose:
                    progress.progress(int((float(ctr) / lschema) * 100))
        finally:
            fds.close()
    finally:
        # Always restore the global flag, whether or not the computation succeeded.
        ocgis.env.OPTIMIZE_FOR_CALC = orig_oc
    if verbose:
        progress.endProgress()
        print('complete.')
    return fill_file
Пример #2
0
def compute(dataset, calc, calc_grouping, tile_dimension, verbose=False, prefix=None):
    """
    Fill a NetCDF output file by running a calculation on one spatial tile at a time.

    The output file is first created through a "file only" execution of the operations. Every tile of the
    grid is then computed separately and written into its row/column window of that file.

    :param dataset: Request dataset handed to ``NcDataset`` and ``ocgis.OcgOperations``.
    :param calc: Calculation definition(s) forwarded to ``ocgis.OcgOperations``.
    :param calc_grouping: Calculation grouping forwarded to ``ocgis.OcgOperations``.
    :param tile_dimension: Tile dimension passed to ``tile.get_tile_schema``. Coerced with ``int`` and must be
     greater than zero.
    :param bool verbose: If ``True``, print status messages and show a progress bar.
    :param prefix: Forwarded as ``prefix`` when creating the fill file.
    :raises ValueError: If ``tile_dimension`` is not greater than zero (or cannot be converted to ``int``).
    :raises NotImplementedError: If a target variable in the fill file is neither 3- nor 4-dimensional.
    :returns: The value returned by executing the file-only operations (used as the path of the fill file).
    """
    tile_dimension = int(tile_dimension)
    if tile_dimension <= 0:
        raise ValueError('"tile_dimension" must be greater than 0')

    progress = None
    orig_oc = ocgis.env.OPTIMIZE_FOR_CALC
    ocgis.env.OPTIMIZE_FOR_CALC = True
    try:
        ods = NcDataset(request_dataset=dataset)
        shp = ods.spatial.grid.shape
        if verbose:
            print('getting schema...')
        schema = tile.get_tile_schema(shp[0], shp[1], tile_dimension)
        lschema = len(schema)
        if verbose:
            print('getting fill file...')
        fill_file = ocgis.OcgOperations(dataset=dataset, file_only=True,
                                        calc=calc, calc_grouping=calc_grouping,
                                        output_format='nc', prefix=prefix).execute()
        if verbose:
            print('output file is: {0}'.format(fill_file))
            print('tile count: {0}'.format(lschema))
        fds = nc.Dataset(fill_file, 'a')
        # The inner try/finally guarantees the fill file is closed if a tile raises.
        try:
            if verbose:
                progress = ProgressBar('tiles progress')
            for ctr, indices in enumerate(schema.itervalues(), start=1):
                row = indices['row']
                col = indices['col']
                ret = ocgis.OcgOperations(dataset=dataset, slice=[None, row, col],
                                          calc=calc, calc_grouping=calc_grouping).execute()
                for variable in ret[1].variables.iterkeys():
                    ref = ret[1].calc[variable]
                    for k, v in ref.iteritems():
                        vref = fds.variables[k]
                        # Write the tile into the trailing two dimensions of the target variable.
                        if len(vref.shape) == 3:
                            vref[:, row[0]:row[1], col[0]:col[1]] = v
                        elif len(vref.shape) == 4:
                            vref[:, :, row[0]:row[1], col[0]:col[1]] = v
                        else:
                            raise NotImplementedError(vref.shape)
                        fds.sync()
                if verbose:
                    progress.progress(int((float(ctr) / lschema) * 100))
        finally:
            fds.close()
    finally:
        # Restore the global optimization flag on success and on failure.
        ocgis.env.OPTIMIZE_FOR_CALC = orig_oc
    if verbose:
        progress.endProgress()
        print('complete.')
    return fill_file
Пример #3
0
def main():
    """
    Compute monthly statistics for a hard-coded ``tasmax`` file, one tile at a time.

    A file-only NetCDF output is created, the grid is split into tiles of dimension 100, and the mean plus
    the 90th/95th/99th ``freq_perc`` calculations are executed per tile and written into the output file.
    The elapsed time of the tile loop is printed in minutes.

    :raises NotImplementedError: If a target variable in the fill file is neither 3- nor 4-dimensional.
    """
    import netCDF4 as nc

    ocgis.env.OPTIMIZE_FOR_CALC = True
    # Author's note: 215 minutes (presumably the run time observed for this input -- unverified).
    uri = '/tmp/gridded_obs.tasmax.OBS_125deg.daily.1950-1999.nc'
    variable = 'tasmax'
    # Alternative inputs left by the author:
    #   /usr/local/climate_data/maurer/bcca/obs/tasmax/1_8deg/gridded_obs.tasmax.OBS_125deg.daily.1950.nc
    #   /usr/local/climate_data/daymet/tmax.nc (variable 'tmax')
    rd = ocgis.RequestDataset(uri, variable)
    ods = ocgis.api.dataset.dataset.OcgDataset(rd)
    shp = ods.i.spatial.shape
    print('getting schema...')
    schema = tile.get_tile_schema(shp[0], shp[1], 100)
    calc = [{'func': 'mean', 'name': 'my_mean'},
            {'func': 'freq_perc', 'name': 'perc_90', 'kwds': {'perc': 90}},
            {'func': 'freq_perc', 'name': 'perc_95', 'kwds': {'perc': 95}},
            {'func': 'freq_perc', 'name': 'perc_99', 'kwds': {'perc': 99}}]
    print('getting fill file...')
    fill_file = ocgis.OcgOperations(dataset=rd, file_only=True,
                                    calc=calc, calc_grouping=['month'],
                                    output_format='nc').execute()
    # A single formatted argument prints the same text under the print statement and the print function.
    print('{0} {1}'.format(fill_file, len(schema)))
    fds = nc.Dataset(fill_file, 'a')
    t1 = time.time()
    # Make sure the output file is closed even if one of the tiles fails.
    try:
        for tile_id, indices in schema.iteritems():
            print(tile_id)
            row = indices['row']
            col = indices['col']
            ret = ocgis.OcgOperations(dataset=rd, slice_row=row, slice_column=col,
                                      calc=calc, calc_grouping=['month'],
                                      abstraction='point').execute()
            ref = ret[1].variables[variable].calc_value
            for k, v in ref.iteritems():
                vref = fds.variables[k]
                if len(vref.shape) == 3:
                    vref[:, row[0]:row[1], col[0]:col[1]] = v
                elif len(vref.shape) == 4:
                    vref[:, :, row[0]:row[1], col[0]:col[1]] = v
                else:
                    raise NotImplementedError(vref.shape)
                fds.sync()
    finally:
        fds.close()
    print((time.time() - t1) / 60.0)
Пример #4
0
 def test_tile_sum(self):
     """
     Rebuild random arrays from their tiles and check that the tiles cover the array exactly.

     For each trial, random row/column counts and a random tile dimension are drawn. The tile stored under
     key ``0`` must match the upper-left ``tdim`` x ``tdim`` window, the per-tile sums must add up to the
     array sum, and copying every tile into an empty array must reproduce the source array.
     """
     ntests = 1000
     for _trial in range(ntests):
         # A separate comprehension variable avoids clobbering the trial counter on Python 2.
         nrow, ncol, tdim = [self.get_random_integer() for _draw in range(3)]
         x = np.random.rand(nrow, ncol)
         y = np.empty((nrow, ncol), dtype=float)
         schema = tile.get_tile_schema(nrow, ncol, tdim)
         tidx = schema[0]
         row = tidx['row']
         col = tidx['col']
         self.assertTrue(np.all(x[row[0]:row[1], col[0]:col[1]] == x[0:tdim, 0:tdim]))
         running_sum = 0.0
         # ``values()`` works on both Python 2 and Python 3 dictionaries.
         for value in schema.values():
             row, col = value['row'], value['col']
             tile_values = x[row[0]:row[1], col[0]:col[1]]
             y[row[0]:row[1], col[0]:col[1]] = tile_values
             running_sum += tile_values.sum()
         self.assertAlmostEqual(running_sum, x.sum())
         self.assertTrue(np.all(x == y))
Пример #5
0
 def test_tile_sum(self):
     """
     Check that the tiles of a schema partition a random array without gaps or overlap.

     Each trial draws random row/column counts and a random tile dimension, then verifies that the tile under
     key ``0`` equals the upper-left window, that per-tile sums add up to the array sum, and that the array
     rebuilt from its tiles equals the source.
     """
     for _trial in range(1000):
         n_rows, n_cols, tile_dim = [self.get_random_integer() for _draw in range(3)]
         source = np.random.rand(n_rows, n_cols)
         rebuilt = np.empty((n_rows, n_cols), dtype=float)
         schema = tile.get_tile_schema(n_rows, n_cols, tile_dim)

         # The first tile has to be the upper-left corner of the array.
         first = schema[0]
         first_rows = slice(first['row'][0], first['row'][1])
         first_cols = slice(first['col'][0], first['col'][1])
         self.assertTrue(np.all(source[first_rows, first_cols] == source[0:tile_dim, 0:tile_dim]))

         total = 0.0
         for bounds in schema.values():
             rows = slice(bounds['row'][0], bounds['row'][1])
             cols = slice(bounds['col'][0], bounds['col'][1])
             chunk = source[rows, cols]
             rebuilt[rows, cols] = chunk
             total += chunk.sum()

         self.assertAlmostEqual(total, source.sum())
         self.assertTrue(np.all(source == rebuilt))
Пример #6
0
def compute(ops, tile_dimension, verbose=False, use_optimizations=True):
    """
    Used for computations on large arrays where memory limitations are a consideration. It is also useful for
    extracting data from a server that has limitations on the size of requested data arrays. This function creates an
    empty destination NetCDF file that is then filled by executing the operations on chunks of the requested
    target dataset(s) and filling the destination NetCDF file.

    If ``ops.callback`` is set, it is temporarily replaced while tiling so that the reported progress is scaled
    over the number of tiles. The original callback is put back on ``ops`` before this function returns or raises.

    :param ops: The target operations to tile. There must be a calculation associated with
     the operations.
    :type ops: :class:`ocgis.OcgOperations`
    :param int tile_dimension: The target tile/chunk dimension. This integer value must be greater than zero.
    :param bool verbose: If ``True``, print more verbose information to terminal.
    :param bool use_optimizations: If ``True``, cache :class:`Field` and :class:`TemporalGroupDimension` objects for
     reuse during tile iteration.
    :raises: AssertionError, ValueError, NotImplementedError
    :returns: Path to the output NetCDF file.
    :rtype: str

    >>> from ocgis import RequestDataset, OcgOperations
    >>> from ocgis.util.large_array import compute
    >>> rd = RequestDataset(uri='/path/to/file',variable='tas')
    >>> ops = OcgOperations(dataset=rd,calc=[{'func':'mean','name':'mean'}],output_format='nc')
    >>> ret = compute(ops, 25)
    """

    # validate arguments
    assert isinstance(ops, OcgOperations)
    assert ops.calc is not None
    assert ops.output_format == "nc"

    tile_dimension = int(tile_dimension)
    if tile_dimension <= 0:
        raise ValueError('"tile_dimension" must be greater than 0')

    # ensure that progress is not showing 100% at first. the caller's callback is remembered so it can be
    # restored when tiling is finished.
    orgcallback = ops.callback
    if orgcallback is not None:

        def zeropercentagecallback(p, m):
            orgcallback(0.0, m)

        ops.callback = zeropercentagecallback

    progress = None
    # save the environment flag for calculation optimizations.
    orig_oc = ocgis.env.OPTIMIZE_FOR_CALC

    try:
        # determine if we are working with a multivariate function
        if OcgCalculationEngine._check_calculation_members_(ops.calc, AbstractMultivariateFunction):
            # only one multivariate calculation allowed
            assert len(ops.calc) == 1
            has_multivariate = True
        else:
            # only one dataset allowed
            assert len(ops.dataset) == 1
            has_multivariate = False

        # work on a copy of the operations to create the template file
        ops_file_only = deepcopy(ops)
        # we need the output to be file only for the first request
        ops_file_only.file_only = True

        # tell the software we are optimizing for calculations
        ocgis.env.OPTIMIZE_FOR_CALC = True

        # first, write the template file
        if verbose:
            print("getting fill file...")
        fill_file = ops_file_only.execute()
        # if there is a geometry, we have to find the offset for the slice. we
        # also need to account for the subset mask.
        if ops.geom is not None:
            if verbose:
                print("geometry subset is present. calculating slice offsets...")
            ops_offset = deepcopy(ops)
            ops_offset.output_format = "numpy"
            ops_offset.calc = None
            ops_offset.agg_selection = True
            ops_offset.snippet = False
            coll = ops_offset.execute()

            for melted in coll.get_iter_melted():
                # assert the values are not loaded...
                assert melted["variable"]._value is None
                # assert only 3 or 4 dimensional data is being used
                assert melted["field"].shape_as_dict["R"] == 1

            ref_spatial = coll[1][ops_offset.dataset.first().name].spatial
            try:
                row_offset = ref_spatial.grid.row._src_idx[0]
                col_offset = ref_spatial.grid.col._src_idx[0]
            except (AttributeError, TypeError):
                # Likely no row and column for a 2-dimensional grid.
                row_offset = ref_spatial.grid._src_idx["row"][0]
                col_offset = ref_spatial.grid._src_idx["col"][0]
            mask_spatial = ref_spatial.get_mask()
        # otherwise the offset is zero...
        else:
            row_offset = 0
            col_offset = 0
            mask_spatial = None

        # get the shape for the tile schema
        if verbose:
            print("getting tile schema shape inputs...")
        shp_variable = ops.calc[0]["name"]
        template_rd = ocgis.RequestDataset(uri=fill_file, variable=shp_variable)
        template_field = template_rd.get()
        shp = template_field.shape[-2:]

        if use_optimizations:
            # if there is a calculation grouping, optimize for it. otherwise, pass
            # this value as None.
            try:
                tgd_field = ops.dataset.first().get()
                template_tgd = tgd_field.temporal.get_grouping(deepcopy(ops.calc_grouping))
                if not has_multivariate:
                    key = ops.dataset.first().name
                else:
                    key = "_".join([__.name for __ in ops.dataset.itervalues()])
                optimizations = {"tgds": {key: template_tgd}}
            except TypeError:
                optimizations = None

            # load the fields and pass those for optimization
            field_optimizations = {}
            for rd in ops.dataset.itervalues():
                gotten_field = rd.get(format_time=ops.format_time)
                field_optimizations[rd.name] = gotten_field
            optimizations = optimizations or {}
            optimizations["fields"] = field_optimizations
        else:
            optimizations = None

        if verbose:
            print("getting tile schema...")
        schema = tile.get_tile_schema(shp[0], shp[1], tile_dimension)
        lschema = len(schema)

        # Create a new callback where the 0-100% range of one execution is converted to the share of the total
        # corresponding to the number of tiles to be calculated.
        percentageDone = 0
        if orgcallback is not None:

            def newcallback(p, m):
                # "percentageDone" is read when the callback fires, so it reflects the tiles finished so far.
                orgcallback((p / lschema) + percentageDone, m)

            ops.callback = newcallback

        if verbose:
            print("output file is: {0}".format(fill_file))
            print("tile count: {0}".format(lschema))

        fds = nc.Dataset(fill_file, "a")
        try:
            if verbose:
                progress = ProgressBar("tiles progress")
            if orgcallback is not None:
                orgcallback(0.0, "Initializing calculation")
            for ctr, indices in enumerate(schema.itervalues(), start=1):
                # appropriate adjust the slices to account for the spatial subset
                row = [ii + row_offset for ii in indices["row"]]
                col = [ii + col_offset for ii in indices["col"]]

                # copy the operations and modify arguments
                ops_slice = deepcopy(ops)
                ops_slice.geom = None
                ops_slice.slice = [None, None, None, row, col]
                ops_slice.output_format = "numpy"
                ops_slice.optimizations = optimizations
                # return the object slice
                ret = ops_slice.execute()
                for field_map in ret.itervalues():
                    for field in field_map.itervalues():
                        field_shape = field.shape_as_dict
                        for alias, variable in field.variables.iteritems():
                            vref = fds.variables[alias]
                            assert isinstance(variable.value, np.ma.MaskedArray)
                            # we need to remove the offsets to adjust for the zero-based
                            # fill file.
                            slice_row = slice(row[0] - row_offset, row[1] - row_offset)
                            slice_col = slice(col[0] - col_offset, col[1] - col_offset)
                            # if there is a spatial mask, update accordingly
                            if mask_spatial is not None:
                                set_variable_spatial_mask(variable, mask_spatial, slice_row, slice_col)
                            # squeeze out extra dimensions from ocgis
                            fill_value = np.squeeze(variable.value)
                            # fill the netCDF container variable adjusting for shape
                            if len(vref.shape) == 3:
                                reshape = (field_shape["T"], field_shape["Y"], field_shape["X"])
                                vref[:, slice_row, slice_col] = fill_value.reshape(*reshape)
                            elif len(vref.shape) == 4:
                                reshape = (field_shape["T"], field_shape["Z"], field_shape["Y"], field_shape["X"])
                                vref[:, :, slice_row, slice_col] = fill_value.reshape(*reshape)
                            else:
                                raise NotImplementedError(vref.shape)

                            # write the data to disk
                            fds.sync()
                if verbose:
                    progress.progress(int((float(ctr) / lschema) * 100))
                if orgcallback is not None:
                    percentageDone = (float(ctr) / lschema) * 100
        finally:
            fds.close()
    finally:
        ocgis.env.OPTIMIZE_FOR_CALC = orig_oc
        # do not leave the internal progress wrapper installed on the caller's operations
        if orgcallback is not None:
            ops.callback = orgcallback
    if verbose:
        progress.endProgress()
        print("complete.")

    return fill_file
Пример #7
0
    def test_tile_get_tile_schema(self):
        """Check the tile counts for a 5x5 and a 25x1 input, each with a tile dimension of 2."""
        expected_counts = (
            ((5, 5, 2), 9),
            ((25, 1, 2), 13),
        )
        for args, expected in expected_counts:
            self.assertEqual(len(tile.get_tile_schema(*args)), expected)
Пример #8
0
    def test_tile_get_tile_schema(self):
        """Verify how many tiles the schema contains for two small inputs."""
        # 5 x 5 input with a tile dimension of 2
        tile_count = len(tile.get_tile_schema(5, 5, 2))
        self.assertEqual(tile_count, 9)

        # 25 x 1 input with a tile dimension of 2
        tile_count = len(tile.get_tile_schema(25, 1, 2))
        self.assertEqual(tile_count, 13)
Пример #9
0
def compute(ops, tile_dimension, verbose=False, use_optimizations=True):
    """
    Used for computations on large arrays where memory limitations are a consideration. It is also useful for
    extracting data from a server that has limitations on the size of requested data arrays. This function creates an
    empty destination NetCDF file that is then filled by executing the operations on chunks of the requested
    target dataset(s) and filling the destination NetCDF file.

    If ``ops.callback`` is set, it is temporarily replaced while tiling so that the reported progress is scaled
    over the number of tiles. The original callback is put back on ``ops`` before this function returns or raises.

    :param ops: The target operations to tile. There must be a calculation associated with
     the operations.
    :type ops: :class:`ocgis.OcgOperations`
    :param int tile_dimension: The target tile/chunk dimension. This integer value must be greater than zero.
    :param bool verbose: If ``True``, print more verbose information to terminal.
    :param bool use_optimizations: If ``True``, cache :class:`Field` and :class:`TemporalGroupDimension` objects for
     reuse during tile iteration.
    :raises: AssertionError, ValueError, NotImplementedError
    :returns: Path to the output NetCDF file.
    :rtype: str

    >>> from ocgis import RequestDataset, OcgOperations
    >>> from ocgis.util.large_array import compute
    >>> rd = RequestDataset(uri='/path/to/file', variable='tas')
    >>> ops = OcgOperations(dataset=rd,calc=[{'func':'mean','name':'mean'}],output_format='nc')
    >>> ret = compute(ops, 25)
    """

    assert isinstance(ops, OcgOperations)
    assert ops.calc is not None
    assert ops.output_format == constants.OutputFormatName.NETCDF

    tile_dimension = int(tile_dimension)
    if tile_dimension <= 0:
        raise ValueError('"tile_dimension" must be greater than 0')

    # Ensure that progress is not showing 100% at first. The caller's callback is remembered so it can be
    # restored when tiling is finished.
    orgcallback = ops.callback
    if orgcallback is not None:

        def zeropercentagecallback(p, m):
            orgcallback(0., m)

        ops.callback = zeropercentagecallback

    progress = None
    # Save the environment flag for calculation optimizations.
    orig_oc = ocgis.env.OPTIMIZE_FOR_CALC

    try:
        # Determine if we are working with a multivariate function.
        if CalculationEngine._check_calculation_members_(
                ops.calc, AbstractMultivariateFunction):
            # Only one multivariate calculation allowed.
            assert len(ops.calc) == 1
            has_multivariate = True
        else:
            # Only one dataset allowed.
            assert len(list(ops.dataset)) == 1
            has_multivariate = False

        # Work on a copy of the operations to create the template file.
        ops_file_only = deepcopy(ops)
        # We need the output to be file only for the first request.
        ops_file_only.file_only = True

        # Tell the software we are optimizing for calculations.
        ocgis.env.OPTIMIZE_FOR_CALC = True

        # First, write the template file.
        if verbose:
            print('getting fill file...')
        fill_file = ops_file_only.execute()

        # If there is a geometry, we have to find the offset for the slice. We
        # also need to account for the subset mask.
        if ops.geom is not None:
            if verbose:
                print(
                    'geometry subset is present. calculating slice offsets...')
            ops_offset = deepcopy(ops)
            ops_offset.output_format = constants.OutputFormatName.OCGIS
            ops_offset.calc = None
            ops_offset.agg_selection = True
            ops_offset.snippet = False
            coll = ops_offset.execute()

            for melted in coll.iter_melted(tag=TagName.DATA_VARIABLES):
                # The values must not have been loaded by the offset request.
                assert melted['variable']._value is None

            ref_field = coll.get_element()
            ref_grid = ref_field.grid
            row_offset = ref_grid.dimensions[0]._src_idx[0]
            col_offset = ref_grid.dimensions[1]._src_idx[0]
            mask_spatial = ref_grid.get_mask()
        # Otherwise the offset is zero...
        else:
            row_offset = 0
            col_offset = 0
            mask_spatial = None

        # Get the shape for the tile schema.
        if verbose:
            print('getting tile schema shape inputs...')
        shp_variable = ops.calc[0]['name']
        template_rd = ocgis.RequestDataset(uri=fill_file,
                                           variable=shp_variable)
        template_field = template_rd.get()
        shp = template_field.grid.shape

        if use_optimizations:
            # If there is a calculation grouping, optimize for it. Otherwise, pass
            # this value as None.
            try:
                archetype_dataset = list(ops.dataset)[0]
                tgd_field = archetype_dataset.get()
                template_tgd = tgd_field.temporal.get_grouping(
                    deepcopy(ops.calc_grouping))
                if not has_multivariate:
                    key = archetype_dataset.field_name
                else:
                    key = '_'.join([__.field_name for __ in ops.dataset])
                optimizations = {'tgds': {key: template_tgd}}
            except TypeError:
                optimizations = None

            # Load the fields and pass those for optimization.
            field_optimizations = {}
            for rd in ops.dataset:
                gotten_field = rd.get(format_time=ops.format_time)
                field_optimizations[rd.field_name] = gotten_field
            optimizations = optimizations or {}
            optimizations['fields'] = field_optimizations
        else:
            optimizations = None

        if verbose:
            print('getting tile schema...')
        schema = tile.get_tile_schema(shp[0], shp[1], tile_dimension)
        lschema = len(schema)

        # Create a new callback where the 0-100% range of one execution is converted to the share of the total
        # corresponding to the number of tiles to be calculated.
        percentageDone = 0
        if orgcallback is not None:

            def newcallback(p, m):
                # "percentageDone" is read when the callback fires, so it reflects the tiles finished so far.
                orgcallback((p / lschema) + percentageDone, m)

            ops.callback = newcallback

        if verbose:
            print('output file is: {0}'.format(fill_file))
            print('tile count: {0}'.format(lschema))

        fds = nc.Dataset(fill_file, 'a')
        try:
            if verbose:
                progress = ProgressBar('tiles progress')
            if orgcallback is not None:
                orgcallback(0., "Initializing calculation")
            for ctr, indices in enumerate(schema.values(), start=1):
                # Adjust the slices to account for the spatial subset.
                row = [ii + row_offset for ii in indices['row']]
                col = [ii + col_offset for ii in indices['col']]
                # The fill file is zero-based, so the offsets are removed again for writing. These slices only
                # depend on the tile, so they are built once per tile.
                slice_row = slice(row[0] - row_offset, row[1] - row_offset)
                slice_col = slice(col[0] - col_offset, col[1] - col_offset)

                # Copy the operations and modify arguments.
                ops_slice = deepcopy(ops)
                ops_slice.geom = None
                ops_slice.slice = [None, None, None, row, col]
                ops_slice.output_format = constants.OutputFormatName.OCGIS
                ops_slice.optimizations = optimizations
                # Return the object slice.
                ret = ops_slice.execute()

                for field in ret.iter_fields():
                    for variable in field.data_variables:
                        vref = fds.variables[variable.name]
                        # If there is a spatial mask, update accordingly.
                        if mask_spatial is not None:
                            set_variable_spatial_mask(variable, mask_spatial,
                                                      slice_row, slice_col)
                            fill_mask = field.grid.get_mask(create=True)
                            fill_mask[:, :] = mask_spatial[slice_row,
                                                           slice_col]
                            fill_mask = np.ma.array(np.zeros(fill_mask.shape),
                                                    mask=fill_mask)
                            fds.variables[field.grid.mask_variable.name][
                                slice_row, slice_col] = fill_mask
                        fill_value = variable.get_masked_value()
                        # Fill the netCDF container variable adjusting for shape.
                        if len(vref.shape) == 3:
                            vref[:, slice_row, slice_col] = fill_value
                        elif len(vref.shape) == 4:
                            vref[:, :, slice_row, slice_col] = fill_value
                        else:
                            raise NotImplementedError(vref.shape)

                        # Write the data to disk.
                        fds.sync()
                if verbose:
                    progress.progress(int((float(ctr) / lschema) * 100))
                if orgcallback is not None:
                    percentageDone = ((float(ctr) / lschema) * 100)
        finally:
            fds.close()
    finally:
        ocgis.env.OPTIMIZE_FOR_CALC = orig_oc
        # Do not leave the internal progress wrapper installed on the caller's operations.
        if orgcallback is not None:
            ops.callback = orgcallback
    if verbose:
        progress.endProgress()
        print('complete.')

    return fill_file
Пример #10
0
def compute(dataset, calc, calc_grouping, tile_dimension, verbose=False, prefix=None):
    """
    Execute calculations tile-by-tile and write the results into a single NetCDF file.

    Temporal values (and the temporal group when ``calc_grouping`` is given) of every request dataset are first
    placed in ``ocgis.env._optimize_store``. A file-only NetCDF output is then created and filled by executing
    the calculation once per tile. The optimize store is emptied and ``ocgis.env.OPTIMIZE_FOR_CALC`` is restored
    when the function exits, also on failure.

    :param dataset: The request datasets to compute on. The grid shape is taken from the first one.
    :type dataset: RequestDatasetCollection
    :param calc: Calculation definitions forwarded to ``ocgis.OcgOperations``.
    :type calc: list or tuple
    :param calc_grouping: Calculation grouping forwarded to ``ocgis.OcgOperations``; may be ``None``.
    :param tile_dimension: Tile dimension passed to ``tile.get_tile_schema``. Coerced with ``int`` and must be
     greater than zero.
    :param bool verbose: If ``True``, print status messages and show a progress bar.
    :param prefix: Forwarded as ``prefix`` when creating the fill file.
    :raises AssertionError: If ``dataset`` or ``calc`` has an unexpected type.
    :raises ValueError: If ``tile_dimension`` is not greater than zero.
    :raises NotImplementedError: If a target variable in the fill file is neither 3- nor 4-dimensional.
    :returns: The value returned by executing the file-only operations (used as the path of the fill file).
    """
    assert isinstance(dataset, RequestDatasetCollection)
    assert isinstance(calc, (list, tuple))

    tile_dimension = int(tile_dimension)
    if tile_dimension <= 0:
        raise ValueError('"tile_dimension" must be greater than 0')

    progress = None
    orig_oc = ocgis.env.OPTIMIZE_FOR_CALC
    ocgis.env.OPTIMIZE_FOR_CALC = False

    try:
        # load some data into the optimize store
        # NOTE(review): this message is printed regardless of "verbose" -- kept as in the original.
        print('loading into optimize store...')
        for rd in dataset:
            if verbose:
                # a single formatted argument prints the same text on Python 2 and Python 3
                print('request dataset {0}'.format(rd.alias))
            ocgis.env._optimize_store[rd.alias] = {}
            ocgis.env._optimize_store[rd.alias]['_value_datetime'] = rd.ds.temporal.value_datetime
            ocgis.env._optimize_store[rd.alias]['_bounds_datetime'] = rd.ds.temporal.bounds_datetime
            if calc_grouping is not None:
                rd.ds.temporal.set_grouping(calc_grouping)
                ocgis.env._optimize_store[rd.alias]['group'] = rd.ds.temporal.group
            # drop the dataset reference held by the request dataset
            rd._ds = None

        # tell the software we are optimizing for calculations
        ocgis.env.OPTIMIZE_FOR_CALC = True
        ods = NcDataset(request_dataset=dataset[0])
        shp = ods.spatial.grid.shape

        if verbose:
            print('getting schema...')
        schema = tile.get_tile_schema(shp[0], shp[1], tile_dimension)
        lschema = len(schema)
        if verbose:
            print('getting fill file...')
        fill_file = ocgis.OcgOperations(dataset=dataset, file_only=True,
                                        calc=calc, calc_grouping=calc_grouping,
                                        output_format='nc', prefix=prefix).execute()
        if verbose:
            print('output file is: {0}'.format(fill_file))
            print('tile count: {0}'.format(lschema))
        fds = nc.Dataset(fill_file, 'a')
        # close the fill file even when a tile fails so the handle is never leaked
        try:
            if verbose:
                progress = ProgressBar('tiles progress')
            for ctr, indices in enumerate(schema.itervalues(), start=1):
                row = indices['row']
                col = indices['col']
                ret = ocgis.OcgOperations(dataset=dataset, slice=[None, row, col],
                                          calc=calc, calc_grouping=calc_grouping).execute()
                for vref, v in iter_variable_values(ret[1], fds):
                    if len(vref.shape) == 3:
                        vref[:, row[0]:row[1], col[0]:col[1]] = v
                    elif len(vref.shape) == 4:
                        vref[:, :, row[0]:row[1], col[0]:col[1]] = v
                    else:
                        raise NotImplementedError(vref.shape)
                    fds.sync()
                if verbose:
                    progress.progress(int((float(ctr) / lschema) * 100))
        finally:
            fds.close()
    finally:
        ocgis.env.OPTIMIZE_FOR_CALC = orig_oc
        ocgis.env._optimize_store = {}
    if verbose:
        progress.endProgress()
        print('complete.')
    return fill_file
Пример #11
0
def compute(ops, tile_dimension, verbose=False, use_optimizations=True):
    """
    Used for computations on large arrays where memory limitations are a consideration. It is also useful for
    extracting data from a server that has limitations on the size of requested data arrays. This function creates an
    empty destination NetCDF file that is then filled by executing the operations on chunks of the requested
    target dataset(s) and filling the destination NetCDF file.

    If ``ops`` carries a callback, it is temporarily replaced by wrappers that rescale the reported percentage to
    the tile currently being processed. The original callback is put back on ``ops`` before this function returns
    or raises.

    :param ops: The target operations to tile. There must be a calculation associated with
     the operations.
    :type ops: :class:`ocgis.OcgOperations`
    :param int tile_dimension: The target tile/chunk dimension. This integer value must be greater than zero.
    :param bool verbose: If ``True``, print more verbose information to terminal.
    :param bool use_optimizations: If ``True``, cache :class:`~ocgis.Field` and :class:`~ocgis.TemporalGroupVariable`
     objects for reuse during tile iteration.
    :raises: AssertionError, ValueError
    :returns: Path to the output NetCDF file.
    :rtype: str

    >>> from ocgis import RequestDataset, OcgOperations
    >>> from ocgis.util.large_array import compute
    >>> rd = RequestDataset(uri='/path/to/file', variable='tas')
    >>> ops = OcgOperations(dataset=rd, calc=[{'func':'mean','name':'mean'}],output_format='nc')
    >>> ret = compute(ops, 25)
    """

    assert isinstance(ops, OcgOperations)
    assert ops.output_format == constants.OutputFormatName.NETCDF

    # Validate the arguments before touching any state on "ops" so that a rejected call leaves it unmodified.
    tile_dimension = int(tile_dimension)
    if tile_dimension <= 0:
        raise ValueError('"tile_dimension" must be greater than 0')

    # Determine if we are working with a multivariate function.
    if ops.calc is not None:
        if CalculationEngine._check_calculation_members_(ops.calc, AbstractMultivariateFunction):
            # Only one multivariate calculation allowed.
            assert len(ops.calc) == 1
            has_multivariate = True
        else:
            # Only one dataset allowed.
            assert len(list(ops.dataset)) == 1
            has_multivariate = False
    else:
        has_multivariate = False

    # Remember the caller's callback so it can be restored in the "finally" clause below.
    orgcallback = ops.callback
    has_callback = orgcallback is not None
    # save the environment flag for calculation optimizations.
    orig_oc = ocgis.env.OPTIMIZE_FOR_CALC

    try:
        # Ensure that progress is not showing 100% at first: while the template file and the offsets are computed,
        # every progress report is forwarded as 0%.
        if has_callback:
            def zeropercentagecallback(p, m):
                orgcallback(0., m)

            ops.callback = zeropercentagecallback

        # work on a copy of the operations to create the template file
        ops_file_only = deepcopy(ops)
        # we need the output to be file only for the first request
        if ops.calc is not None:
            ops_file_only.file_only = True

        # tell the software we are optimizing for calculations
        ocgis.env.OPTIMIZE_FOR_CALC = True

        # first, write the template file
        if verbose:
            print('getting fill file...')
        fill_file = ops_file_only.execute()

        # if there is a geometry, we have to find the offset for the slice. we
        # also need to account for the subset mask.
        if ops.geom is not None:
            if verbose:
                print('geometry subset is present. calculating slice offsets...')
            ops_offset = deepcopy(ops)
            ops_offset.output_format = constants.OutputFormatName.OCGIS
            ops_offset.calc = None
            ops_offset.agg_selection = True
            ops_offset.snippet = False
            coll = ops_offset.execute()

            # presumably guards against the offset request having loaded data values -- confirm with the
            # collection API.
            for row in coll.iter_melted(tag=TagName.DATA_VARIABLES):
                assert row['variable']._value is None

            ref_field = coll.get_element()
            ref_grid = ref_field.grid
            row_offset = ref_grid.dimensions[0]._src_idx[0]
            col_offset = ref_grid.dimensions[1]._src_idx[0]
            mask_spatial = ref_grid.get_mask()
        # otherwise the offset is zero...
        else:
            row_offset = 0
            col_offset = 0
            mask_spatial = None

        # get the shape for the tile schema
        if verbose:
            print('getting tile schema shape inputs...')
        if ops.calc is not None:
            shp_variable = ops.calc[0]['name']
        else:
            shp_variable = None
        template_rd = ocgis.RequestDataset(uri=fill_file, variable=shp_variable)
        template_field = template_rd.get()
        shp = template_field.grid.shape

        if use_optimizations:
            # if there is a calculation grouping, optimize for it. otherwise, pass
            # this value as None.
            try:
                archetype_dataset = list(ops.dataset)[0]
                tgd_field = archetype_dataset.get()
                template_tgd = tgd_field.temporal.get_grouping(deepcopy(ops.calc_grouping))
                if not has_multivariate:
                    key = archetype_dataset.field_name
                else:
                    key = '_'.join([__.field_name for __ in ops.dataset])
                optimizations = {'tgds': {key: template_tgd}}
            except TypeError:
                optimizations = None

            # load the fields and pass those for optimization
            field_optimizations = {}
            for rd in ops.dataset:
                gotten_field = rd.get(format_time=ops.format_time)
                field_optimizations.update({rd.field_name: gotten_field})
            optimizations = optimizations or {}
            optimizations['fields'] = field_optimizations
        else:
            optimizations = None

        if verbose:
            print('getting tile schema...')
        schema = tile.get_tile_schema(shp[0], shp[1], tile_dimension)
        lschema = len(schema)

        # Create a new callback function where the 0-100% range of a single tile is converted to the share of the
        # overall progress that corresponds to the number of tiles to be calculated.
        if has_callback:
            # Read by "newcallback" through its closure; rebound after every finished tile in the loop below.
            percentageDone = 0

            def newcallback(p, m):
                p = (p / lschema) + percentageDone
                orgcallback(p, m)

            ops.callback = newcallback

        if verbose:
            print('output file is: {0}'.format(fill_file))
            print('tile count: {0}'.format(lschema))

        fds = nc.Dataset(fill_file, 'a')
        try:
            if verbose:
                progress = ProgressBar('tiles progress')
            if has_callback:
                orgcallback(0., "Initializing calculation")
            for ctr, indices in enumerate(schema.values(), start=1):
                # appropriate adjust the slices to account for the spatial subset
                row = [ii + row_offset for ii in indices['row']]
                col = [ii + col_offset for ii in indices['col']]

                # copy the operations and modify arguments
                ops_slice = deepcopy(ops)
                ops_slice.geom = None
                ops_slice.slice = [None, None, None, row, col]
                ops_slice.output_format = constants.OutputFormatName.OCGIS
                ops_slice.optimizations = optimizations
                # return the object slice
                ret = ops_slice.execute()

                for field in ret.iter_fields():
                    for variable in field.data_variables:
                        vref = fds.variables[variable.name]
                        # we need to remove the offsets to adjust for the zero-based fill file.
                        slice_row = slice(row[0] - row_offset, row[1] - row_offset)
                        slice_col = slice(col[0] - col_offset, col[1] - col_offset)
                        # if there is a spatial mask, update accordingly
                        if mask_spatial is not None:
                            set_variable_spatial_mask(variable, mask_spatial, slice_row, slice_col)
                            fill_mask = field.grid.get_mask(create=True)
                            fill_mask[:, :] = mask_spatial[slice_row, slice_col]
                            fill_mask = np.ma.array(np.zeros(fill_mask.shape), mask=fill_mask)
                            fds.variables[field.grid.mask_variable.name][slice_row, slice_col] = fill_mask
                        fill_value = variable.get_masked_value()
                        # fill the netCDF container variable adjusting for shape
                        if len(vref.shape) == 3:
                            vref[:, slice_row, slice_col] = fill_value
                        elif len(vref.shape) == 4:
                            vref[:, :, slice_row, slice_col] = fill_value
                        else:
                            raise NotImplementedError(vref.shape)

                        fds.sync()
                if verbose:
                    progress.progress(int((float(ctr) / lschema) * 100))
                if has_callback:
                    percentageDone = ((float(ctr) / lschema) * 100)
        finally:
            fds.close()
    finally:
        ocgis.env.OPTIMIZE_FOR_CALC = orig_oc
        # Hand the operations back with the callback the caller supplied rather than the internal wrapper.
        if has_callback:
            ops.callback = orgcallback
    if verbose:
        progress.endProgress()
        print('complete.')

    return fill_file
Пример #12
0
def compute(dataset,
            calc,
            calc_grouping,
            tile_dimension,
            verbose=False,
            prefix=None):
    '''
    Execute a calculation tile-by-tile and write the results into a NetCDF
    "fill file" created up front with a file-only request.

    :param dataset: The request datasets to operate on.
    :type dataset: RequestDatasetCollection
    :param calc: Calculation definitions passed through to
     ``ocgis.OcgOperations``.
    :type calc: list or tuple
    :param calc_grouping: Temporal grouping passed to the operations. When not
     ``None`` it is also applied to each dataset's temporal dimension while
     loading the optimize store.
    :param int tile_dimension: Tile dimension handed to
     ``tile.get_tile_schema``. Must be greater than zero.
    :param bool verbose: If ``True``, print progress information.
    :param prefix: Passed as ``prefix`` to the operations creating the fill
     file.
    :raises: AssertionError, ValueError
    :returns: The value returned by the file-only operations, which is opened
     here as the NetCDF fill file.
    '''
    assert isinstance(dataset, RequestDatasetCollection)
    assert isinstance(calc, (list, tuple))

    tile_dimension = int(tile_dimension)
    if tile_dimension <= 0:
        raise ValueError('"tile_dimension" must be greater than 0')

    orig_oc = ocgis.env.OPTIMIZE_FOR_CALC
    ocgis.env.OPTIMIZE_FOR_CALC = False

    try:
        ## load some data into the optimize store
        print('loading into optimize store...')
        for rd in dataset:
            if verbose:
                print('request dataset {0}'.format(rd.alias))
            store = {}
            ocgis.env._optimize_store[rd.alias] = store
            store['_value_datetime'] = rd.ds.temporal.value_datetime
            store['_bounds_datetime'] = rd.ds.temporal.bounds_datetime
            if calc_grouping is not None:
                rd.ds.temporal.set_grouping(calc_grouping)
                store['group'] = rd.ds.temporal.group
            # drop the dataset reference held by the request dataset
            rd._ds = None

        ## tell the software we are optimizing for calculations
        ocgis.env.OPTIMIZE_FOR_CALC = True
        ods = NcDataset(request_dataset=dataset[0])
        shp = ods.spatial.grid.shape

        if verbose:
            print('getting schema...')
        schema = tile.get_tile_schema(shp[0], shp[1], tile_dimension)
        lschema = len(schema)
        if verbose:
            print('getting fill file...')
        fill_file = ocgis.OcgOperations(dataset=dataset,
                                        file_only=True,
                                        calc=calc,
                                        calc_grouping=calc_grouping,
                                        output_format='nc',
                                        prefix=prefix).execute()
        if verbose:
            print('output file is: {0}'.format(fill_file))
            print('tile count: {0}'.format(lschema))

        fds = nc.Dataset(fill_file, 'a')
        # Close the fill file even when a tile fails so the handle is not
        # leaked.
        try:
            if verbose:
                progress = ProgressBar('tiles progress')
            for ctr, indices in enumerate(schema.itervalues(), start=1):
                row = indices['row']
                col = indices['col']
                ret = ocgis.OcgOperations(dataset=dataset,
                                          slice=[None, row, col],
                                          calc=calc,
                                          calc_grouping=calc_grouping).execute()
                for vref, v in iter_variable_values(ret[1], fds):
                    # the last two dimensions of the target variable are
                    # indexed by the tile's row and column bounds
                    if len(vref.shape) == 3:
                        vref[:, row[0]:row[1], col[0]:col[1]] = v
                    elif len(vref.shape) == 4:
                        vref[:, :, row[0]:row[1], col[0]:col[1]] = v
                    else:
                        raise NotImplementedError(vref.shape)
                    fds.sync()
                if verbose:
                    progress.progress(int((float(ctr) / lschema) * 100))
        finally:
            fds.close()
    finally:
        ocgis.env.OPTIMIZE_FOR_CALC = orig_oc
        ocgis.env._optimize_store = {}
    if verbose:
        progress.endProgress()
        print('complete.')
    return fill_file
Пример #13
0
def compute(dataset,calc,calc_grouping,tile_dimension,verbose=False,prefix=None):
    '''
    Execute a calculation tile-by-tile and write the results into a NetCDF
    "fill file" created up front with a file-only request.

    :param dataset: The request datasets to operate on. The first element is
     loaded to obtain the shape used for the tile schema.
    :type dataset: RequestDatasetCollection
    :param calc: Calculation definitions passed through to
     ``ocgis.OcgOperations``.
    :type calc: list or tuple
    :param calc_grouping: Temporal grouping passed through to the operations.
    :param int tile_dimension: Tile dimension handed to
     ``tile.get_tile_schema``. Must be greater than zero.
    :param bool verbose: If ``True``, print progress information.
    :param prefix: Passed as ``prefix`` to the operations creating the fill
     file.
    :raises: AssertionError, ValueError
    :returns: The value returned by the file-only operations, which is opened
     here as the NetCDF fill file.
    '''
    assert isinstance(dataset,RequestDatasetCollection)
    assert isinstance(calc,(list,tuple))

    tile_dimension = int(tile_dimension)
    if tile_dimension <= 0:
        raise ValueError('"tile_dimension" must be greater than 0')

    orig_oc = ocgis.env.OPTIMIZE_FOR_CALC

    try:
        ## tell the software we are optimizing for calculations
        ocgis.env.OPTIMIZE_FOR_CALC = True
        ods = dataset[0].get()
        # the trailing two entries of the shape are used as the tile extents
        shp = ods.shape[-2:]

        if verbose:
            print('getting schema...')
        schema = tile.get_tile_schema(shp[0],shp[1],tile_dimension)
        lschema = len(schema)
        if verbose:
            print('getting fill file...')
        fill_file = ocgis.OcgOperations(dataset=dataset,file_only=True,
                                        calc=calc,calc_grouping=calc_grouping,
                                        output_format='nc',prefix=prefix).execute()
        if verbose:
            print('output file is: {0}'.format(fill_file))
            print('tile count: {0}'.format(lschema))

        fds = nc.Dataset(fill_file,'a')
        # Close the fill file even when a tile fails so the handle is not
        # leaked.
        try:
            if verbose:
                progress = ProgressBar('tiles progress')
            for ctr,indices in enumerate(schema.itervalues(),start=1):
                row = indices['row']
                col = indices['col']
                ret = ocgis.OcgOperations(dataset=dataset,slice=[None,None,None,row,col],
                                          calc=calc,calc_grouping=calc_grouping).execute()
                for field_map in ret.itervalues():
                    for field in field_map.itervalues():
                        for alias,variable in field.variables.iteritems():
                            vref = fds.variables[alias]
                            # singleton dimensions are squeezed out before the
                            # values are written into the tile's window
                            if len(vref.shape) == 3:
                                vref[:,row[0]:row[1],col[0]:col[1]] = np.squeeze(variable.value)
                            elif len(vref.shape) == 4:
                                vref[:,:,row[0]:row[1],col[0]:col[1]] = np.squeeze(variable.value)
                            else:
                                raise NotImplementedError(vref.shape)
                            fds.sync()
                if verbose:
                    progress.progress(int((float(ctr)/lschema)*100))
        finally:
            fds.close()
    finally:
        ocgis.env.OPTIMIZE_FOR_CALC = orig_oc
    if verbose:
        progress.endProgress()
        print('complete.')
    return fill_file
Пример #14
0
def main():
    """
    Script entry point: compute monthly statistics for a hard-coded NetCDF
    file tile-by-tile and write them into a file-only "fill file".

    The tile schema uses a tile dimension of 100. The elapsed time of the tile
    loop is printed in minutes at the end.
    """
    ocgis.env.OPTIMIZE_FOR_CALC = True
    # original author's note: "215 minutes" (presumably the duration of a run)
    uri = '/tmp/gridded_obs.tasmax.OBS_125deg.daily.1950-1999.nc'
    # alternative inputs noted by the original author:
    #   /usr/local/climate_data/maurer/bcca/obs/tasmax/1_8deg/gridded_obs.tasmax.OBS_125deg.daily.1950.nc
    #   /usr/local/climate_data/daymet/tmax.nc (variable 'tmax')
    variable = 'tasmax'
    rd = ocgis.RequestDataset(uri, variable)
    import netCDF4 as nc
    ods = ocgis.api.dataset.dataset.OcgDataset(rd)
    shp = ods.i.spatial.shape
    print('getting schema...')
    schema = tile.get_tile_schema(shp[0], shp[1], 100)
    calc = [{
        'func': 'mean',
        'name': 'my_mean'
    }, {
        'func': 'freq_perc',
        'name': 'perc_90',
        'kwds': {
            'perc': 90,
        }
    }, {
        'func': 'freq_perc',
        'name': 'perc_95',
        'kwds': {
            'perc': 95,
        }
    }, {
        'func': 'freq_perc',
        'name': 'perc_99',
        'kwds': {
            'perc': 99,
        }
    }]
    print('getting fill file...')
    fill_file = ocgis.OcgOperations(dataset=rd,
                                    file_only=True,
                                    calc=calc,
                                    calc_grouping=['month'],
                                    output_format='nc').execute()
    # Single-argument print calls give the same output whether or not print is
    # a function in the running interpreter.
    print('{0} {1}'.format(fill_file, len(schema)))
    fds = nc.Dataset(fill_file, 'a')
    # Close the fill file even when a tile fails so the handle is not leaked.
    try:
        t1 = time.time()
        for tile_id, indices in schema.iteritems():
            print(tile_id)
            row = indices['row']
            col = indices['col']
            ret = ocgis.OcgOperations(dataset=rd,
                                      slice_row=row,
                                      slice_column=col,
                                      calc=calc,
                                      calc_grouping=['month'],
                                      abstraction='point').execute()
            ref = ret[1].variables[variable].calc_value
            for k, v in ref.iteritems():
                vref = fds.variables[k]
                # the last two dimensions of the target variable are indexed
                # by the tile's row and column bounds
                if len(vref.shape) == 3:
                    vref[:, row[0]:row[1], col[0]:col[1]] = v
                elif len(vref.shape) == 4:
                    vref[:, :, row[0]:row[1], col[0]:col[1]] = v
                else:
                    raise NotImplementedError(vref.shape)
                fds.sync()
    finally:
        fds.close()
    print((time.time() - t1) / 60.0)