예제 #1
0
파일: cr.py 프로젝트: pyoceans/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        """Write ``df`` to a new netCDF file at ``output`` and return it re-opened.

        Keyword arguments consumed here (anything left over is forwarded to the
        ``ContiguousRaggedTrajectoryProfile`` constructor used for the return value):

        * ``axes``: passed to ``get_default_axes`` to name the axis columns.
        * ``unique_dims``: when ``True`` every dimension is named ``<axis>_dim`` so a
          dimension and a coordinate variable never share a name.
        * ``attributes``: merged over ``nc.nc_attributes(axes, daxes)``.
        * ``profile_vars``: columns stored once per profile (first value found in each
          profile) instead of once per sample.
        * ``reduce_dims`` / ``unlimited``: accepted and discarded.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        # Not used by this writer; popped so they are not forwarded to the constructor
        # call at the end of the method.
        kwargs.pop('reduce_dims', False)
        kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimensions to avoid a dimension and coordinate having the same
            # name, which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)

            nc.createDimension(daxes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory,
                                           get_dtype(df[axes.trajectory]),
                                           (daxes.trajectory, ))
            trajectory[:] = np.array(unique_trajectories)

            # One slot on the profile dimension per distinct profile id
            unique_profiles = df[axes.profile].unique()
            num_profiles = len(unique_profiles)

            nc.createDimension(daxes.profile, num_profiles)
            profile = nc.createVariable(axes.profile,
                                        get_dtype(df[axes.profile]),
                                        (daxes.profile, ))
            profile[:] = np.array(unique_profiles)

            # The sample dimension holds every row of the dataframe
            num_obs = len(df)
            nc.createDimension(daxes.sample, num_obs)

            # The trajectory this profile belongs to
            t_ind = nc.createVariable('trajectoryIndex', 'i4',
                                      (daxes.profile, ))
            # Number of observations in each profile
            row_size = nc.createVariable('rowSize', 'i4', (daxes.profile, ))

            # Create all of the axis variables
            time = nc.createVariable(axes.t,
                                     'f8', (daxes.profile, ),
                                     fill_value=np.dtype('f8').type(
                                         cls.default_fill_value))
            latitude = nc.createVariable(
                axes.y,
                get_dtype(df[axes.y]), (daxes.profile, ),
                fill_value=df[axes.y].dtype.type(cls.default_fill_value))
            longitude = nc.createVariable(
                axes.x,
                get_dtype(df[axes.x]), (daxes.profile, ),
                fill_value=df[axes.x].dtype.type(cls.default_fill_value))

            # Axes variables are already processed so skip them
            data_columns = [d for d in df.columns if d not in axes]
            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the profile axis
            profile_vars = kwargs.pop('profile_vars', [])
            profile_columns = [p for p in profile_vars if p in data_columns]
            for c in profile_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    create_ncvar_from_series(nc,
                                             var_name, (daxes.profile, ),
                                             df[c],
                                             zlib=True,
                                             complevel=1)

            # Slot of every profile id inside the ``profile`` variable written above.
            # Looking the slot up (instead of counting within each trajectory) keeps all
            # profile-dimensioned variables aligned with ``profile`` and stops a later
            # trajectory from overwriting the slots filled by an earlier one.
            profile_slots = {
                pid: slot for slot, pid in enumerate(unique_profiles)
            }

            for i, (_, trg) in enumerate(trajectory_groups):
                for pid, pfg in trg.groupby(axes.profile):
                    j = profile_slots[pid]
                    time[j] = get_ncdata_from_series(pfg[axes.t],
                                                     time).astype('f8')[0]
                    latitude[j] = get_ncdata_from_series(
                        pfg[axes.y], latitude)[0]
                    longitude[j] = get_ncdata_from_series(
                        pfg[axes.x], longitude)[0]
                    row_size[j] = len(pfg)
                    t_ind[j] = i

                    # Save any profile variables on the "profile" index using the first
                    # value found in the column.
                    for c in profile_columns:
                        var_name = cf_safe_name(c)
                        if var_name not in nc.variables:
                            continue
                        v = nc.variables[var_name]
                        vvalues = get_ncdata_from_series(pfg[c], v)[0]
                        try:
                            v[j] = vvalues
                        except Exception:
                            # Best effort: log and move on to the next column
                            L.exception('Failed to add {}'.format(c))
                            continue

            # Add back in the z axes that was removed when calculating data_columns
            # and ignore variables that were stored in the profile index
            sample_columns = [
                f for f in data_columns + [axes.z] if f not in profile_columns
            ]
            skips = ['trajectoryIndex', 'rowSize']
            for c in [d for d in sample_columns if d not in skips]:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name, (daxes.sample, ),
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    v[:] = vvalues.reshape(v.shape)
                except Exception:
                    # Best effort: log and move on to the next column
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectoryProfile(output, **kwargs)
예제 #2
0
    def from_dataframe(cls, df, output, **kwargs):
        """Serialize ``df`` into a (time, z, station) netCDF file at ``output``.

        The frame is re-indexed onto the full product of its time, z and station
        values so each data column can be reshaped directly onto those dimensions.
        Consumes the ``axes``, ``reduce_dims``, ``unlimited`` and ``attributes``
        keyword arguments; whatever remains is handed to the
        ``OrthogonalMultidimensionalTimeseriesProfile`` constructor that produces
        the return value.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        # Every non-axis column becomes a data variable
        value_columns = [name for name in df.columns if name not in axes]

        drop_station_dim = kwargs.pop('reduce_dims', False)
        grow_time = kwargs.pop('unlimited', False)

        # int64 columns are downcast to int32
        df = downcast_dataframe(df)

        # Expand the index to the Cartesian product of all time, z and station values.
        # No per-group iteration is needed afterwards: a whole column can simply be
        # reshaped to the sizes of the unique dimensions.
        level_names = [axes.t, axes.z, axes.station]
        df = df.set_index(level_names)
        full_index = pd.MultiIndex.from_product(df.index.levels,
                                                names=level_names)
        df = df.reindex(full_index)

        z_values = df.index.get_level_values(axes.z).unique().values
        # tolist() converts the entries to Timestamp
        t_values = df.index.get_level_values(axes.t).unique().tolist()
        station_level = df.index.get_level_values(axes.station)
        station_ids = station_level.unique()

        with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:

            if drop_station_dim is True and station_ids.size == 1:
                # A lone station: its dimension of size 1 can be left out
                write_slice = np.s_[:, :]
                value_dims = (axes.t, axes.z)
                per_station_dims = ()
            else:
                write_slice = np.s_[:, :, :]
                value_dims = (axes.t, axes.z, axes.station)
                per_station_dims = (axes.station, )
                nc.createDimension(axes.station, station_ids.size)

            station_var = nc.createVariable(axes.station,
                                            get_dtype(station_ids),
                                            per_station_dims)
            lat_var = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                        per_station_dims)
            lon_var = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                        per_station_dims)

            # VLEN (string) variables must be assigned by integer index, and the
            # lat/lon of each station is looked up by that same index
            for position, station_id in enumerate(station_ids):
                station_var[position] = station_id
                lat_var[position] = (
                    df[axes.y][station_level == station_id].dropna().iloc[0])
                lon_var[position] = (
                    df[axes.x][station_level == station_id].dropna().iloc[0])

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Time dimension: unlimited on request, otherwise fixed
            time_length = None if grow_time is True else len(t_values)
            nc.createDimension(axes.t, time_length)
            time_var = nc.createVariable(axes.t, 'f8', (axes.t, ))
            time_var[:] = nc4.date2num(t_values, units=cls.default_time_unit)

            nc.createDimension(axes.z, z_values.size)
            z_var = nc.createVariable(axes.z, get_dtype(z_values), (axes.z, ))
            z_var[:] = z_values

            base_attributes = nc.nc_attributes(axes)
            attributes = dict_update(base_attributes,
                                     kwargs.pop('attributes', {}))

            coordinates = '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
            full_shape = (len(t_values), z_values.size, station_ids.size)

            for column in value_columns:
                var_name = cf_safe_name(column)
                if var_name in nc.variables:
                    v = nc.variables[var_name]
                else:
                    # First time this name is seen: create it and record its coordinates
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 value_dims,
                                                 df[column],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}),
                        {'coordinates': coordinates})

                column_values = get_ncdata_from_series(df[column], v)
                v[write_slice] = column_values.reshape(*full_shape)

            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
예제 #3
0
    def from_dataframe(cls, df, output, **kwargs):
        """Serialize ``df`` into a (profile, z) netCDF file at ``output``.

        Each profile group occupies one row; its values fill the leading slots of
        that row along the z dimension, which is sized to the largest profile.
        Consumes the ``axes``, ``unlimited``, ``unique_dims`` and ``attributes``
        keyword arguments; the rest are handed to the
        ``IncompleteMultidimensionalProfile`` constructor that produces the return
        value.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        # Every non-axis column becomes a data variable
        value_columns = [name for name in df.columns if name not in axes]

        grow_profiles = kwargs.pop('unlimited', False)

        # Dimensions share the axis names unless unique names were requested
        daxes = axes
        if kwargs.pop('unique_dims', False) is True:
            # Suffix the dimension names so a dimension and a coordinate never have the
            # same name, which is not supported in xarray
            daxes = get_default_axes({
                key: '{}_dim'.format(value)
                for key, value in axes._asdict().items()
            })

        # Timezone aware datetimes become native UTC times, then int64 is
        # downcast to int32
        df = nativize_times(df)
        df = downcast_dataframe(df)

        with IncompleteMultidimensionalProfile(output, 'w') as nc:

            by_profile = df.groupby(axes.profile)

            if grow_profiles is True:
                profile_count = None
            else:
                profile_count = df[axes.profile].unique().size
            nc.createDimension(daxes.profile, profile_count)

            # The largest profile determines the length of the z dimension
            nc.createDimension(daxes.z, by_profile.size().max())

            # Metadata variables
            nc.createVariable('crs', 'i4')

            per_profile = (daxes.profile, )
            per_sample = (daxes.profile, daxes.z)

            profile_var = nc.createVariable(axes.profile,
                                            get_dtype(df[axes.profile]),
                                            per_profile)

            # Axis variables
            time_var = nc.createVariable(axes.t, 'f8', per_profile)
            lat_var = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                        per_profile)
            lon_var = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                        per_profile)
            z_var = nc.createVariable(
                axes.z,
                get_dtype(df[axes.z]),
                per_sample,
                fill_value=df[axes.z].dtype.type(cls.default_fill_value))

            base_attributes = nc.nc_attributes(axes, daxes)
            attributes = dict_update(base_attributes,
                                     kwargs.pop('attributes', {}))

            coordinates = '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)

            # Define the data variables from the full frame so none is missed
            for column in value_columns:
                var_name = cf_safe_name(column)
                if var_name in nc.variables:
                    continue
                create_ncvar_from_series(nc,
                                         var_name,
                                         per_sample,
                                         df[column],
                                         zlib=True,
                                         complevel=1)
                attributes[var_name] = dict_update(
                    attributes.get(var_name, {}),
                    {'coordinates': coordinates})

            # Fill one row per profile group
            for row, (profile_id, rows) in enumerate(by_profile):
                profile_var[row] = profile_id

                time_var[row] = date2num(rows[axes.t].iloc[0],
                                         units=cls.default_time_unit)
                lat_var[row] = rows[axes.y].iloc[0]
                lon_var[row] = rows[axes.x].iloc[0]

                depths = rows[axes.z].fillna(z_var._FillValue).values
                z_var[row, 0:depths.size] = depths

                for column in value_columns:
                    target = nc.variables[cf_safe_name(column)]
                    column_values = get_ncdata_from_series(rows[column],
                                                           target)
                    target[row, 0:column_values.size] = column_values

            # Set global attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalProfile(output, **kwargs)
예제 #4
0
파일: r.py 프로젝트: pyoceans/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        """Write ``df`` to a new netCDF file at ``output`` and return it re-opened.

        Keyword arguments consumed here (anything left over is forwarded to the
        ``RaggedTimeseriesProfile`` constructor used for the return value):

        * ``axes``: passed to ``get_default_axes`` to name the axis columns.
        * ``reduce_dims``: when ``True`` and there is exactly one station, the station
          dimension and the ``stationIndex`` variable are not created.
        * ``unlimited``: when ``True`` the sample dimension is created unlimited.
        * ``unique_dims``: when ``True`` every dimension is named ``<axis>_dim``.
        * ``attributes``: merged over ``nc.nc_attributes(axes, daxes)``.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with RaggedTimeseriesProfile(output, 'w') as nc:

            station_groups = df.groupby(axes.station)
            unique_stations = list(station_groups.groups.keys())
            num_stations = len(unique_stations)

            # One slot on the profile dimension per distinct profile id
            profile_groups = df.groupby(axes.profile)
            unique_profiles = list(profile_groups.groups.keys())
            num_profiles = len(unique_profiles)
            nc.createDimension(daxes.profile, num_profiles)

            if reduce_dims is True and num_stations == 1:
                # If a singular station, remove the dimension
                station_dimensions = ()
                s_ind = None
            else:
                station_dimensions = (daxes.station,)
                nc.createDimension(daxes.station, num_stations)
                # The station this profile belongs to
                s_ind = nc.createVariable('stationIndex', 'i4', (daxes.profile,))

            station = nc.createVariable(axes.station, get_dtype(unique_stations), station_dimensions)
            profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (daxes.profile,))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)

            # The sample dimension holds every row of the dataframe (or grows on demand)
            if unlimited is True:
                nc.createDimension(daxes.sample, None)
            else:
                nc.createDimension(daxes.sample, len(df))

            # Number of observations in each profile
            row_size = nc.createVariable('rowSize', 'i4', (daxes.profile,))

            # Axes variables are already processed so skip them
            data_columns = [ d for d in df.columns if d not in axes ]
            data_columns += [axes.t, axes.z]  # time isn't really special, its dimensioned by obs
            attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

            for i, (sname, srg) in enumerate(station_groups):
                station[i] = sname
                # The group already holds exactly the rows of this station, so there is
                # no need to mask the whole dataframe again
                latitude[i] = srg[axes.y].dropna().iloc[0]
                longitude[i] = srg[axes.x].dropna().iloc[0]

            for j, (pname, pfg) in enumerate(profile_groups):
                profile[j] = pname
                row_size[j] = len(pfg)
                if s_ind is not None:
                    # ``np.asscalar`` was removed from NumPy; ``.item()`` is the direct
                    # replacement and still raises unless exactly one station matches.
                    profile_station = pfg[axes.station].dropna().iloc[0]
                    s_ind[j] = np.argwhere(station[:] == profile_station).item()

            # Write every sample-dimensioned column, skipping the bookkeeping variables
            # that were filled in above
            skips = ['stationIndex', 'rowSize']
            for c in [ d for d in data_columns if d not in skips ]:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        (daxes.sample,),
                        df[c],
                        zlib=True,
                        complevel=1
                    )
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    if unlimited is True:
                        v[:] = vvalues
                    else:
                        v[:] = vvalues.reshape(v.shape)
                except Exception:
                    # Best effort: log and move on to the next column
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables (a data column may already have created ``crs``)
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return RaggedTimeseriesProfile(output, **kwargs)
예제 #5
0
    def from_dataframe(cls, df, output, **kwargs):
        """Write ``df`` to a new netCDF file at ``output`` and return it re-opened.

        The time coordinate is taken from the first station group only; all
        stations are assumed to share the same size and identical times.

        Keyword arguments consumed here (anything left over is forwarded to the
        ``OrthogonalMultidimensionalTimeseries`` constructor used for the return
        value):

        * ``axes``: passed to ``get_default_axes`` to name the axis columns.
        * ``reduce_dims``: when ``True`` and there is exactly one station, the station
          dimension is not created.
        * ``unique_dims``: when ``True`` every dimension is named ``<axis>_dim``.
        * ``attributes``: merged over ``nc.nc_attributes(axes, daxes)``.
        * ``unlimited``: accepted and discarded.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        # Not used by this writer; popped so it is not forwarded to the constructor
        # call at the end of the method.
        kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

            station_group = df.groupby(axes.station)
            num_stations = len(station_group)
            has_z = axes.z is not None

            if reduce_dims is True and num_stations == 1:
                # If a station, we can reduce that dimension if it is of size 1
                def ts(i):
                    return np.s_[:]

                default_dimensions = (daxes.t, )
                station_dimensions = ()
            else:

                def ts(i):
                    return np.s_[i, :]

                default_dimensions = (daxes.station, daxes.t)
                station_dimensions = (daxes.station, )
                nc.createDimension(daxes.station, num_stations)

            # Set the coordinates attribute correctly
            coordinates = [axes.t, axes.x, axes.y]
            if has_z is True:
                coordinates.insert(1, axes.z)
            coordinates = ' '.join(coordinates)

            # assume all groups are the same size and have identical times
            _, first_sdf = list(station_group)[0]
            t = first_sdf[axes.t]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            nc.createDimension(daxes.t, t.size)
            time = nc.createVariable(axes.t, 'f8', (daxes.t, ))
            station = nc.createVariable(axes.station,
                                        get_dtype(df[axes.station]),
                                        station_dimensions)
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          station_dimensions)
            if has_z is True:
                z = nc.createVariable(axes.z,
                                      get_dtype(df[axes.z]),
                                      station_dimensions,
                                      fill_value=df[axes.z].dtype.type(
                                          cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            time[:] = get_ncdata_from_series(t, time)

            # Create vars based on full dataframe (to get all variables)
            for c in data_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    create_ncvar_from_series(nc,
                                             var_name,
                                             default_dimensions,
                                             df[c],
                                             zlib=True,
                                             complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}),
                        {'coordinates': coordinates})

            for i, (uid, sdf) in enumerate(station_group):
                station[i] = uid
                latitude[i] = sdf[axes.y].iloc[0]
                longitude[i] = sdf[axes.x].iloc[0]

                if has_z is True:
                    # TODO: write a test for a Z with a _FillValue
                    z[i] = sdf[axes.z].iloc[0]

                for c in data_columns:
                    var_name = cf_safe_name(c)
                    v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(sdf[c], v)
                    try:
                        v[ts(i)] = vvalues
                    except Exception:
                        # Deliberate best effort for variables that do not fit the
                        # station/time slice. Only ``Exception`` is caught so that
                        # KeyboardInterrupt and SystemExit are never swallowed.
                        L.debug(
                            '{} was not written. Likely a metadata variable'.
                            format(v.name))

            # Set global attributes
            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseries(output, **kwargs)
예제 #6
0
    def from_dataframe(cls, df, output, **kwargs):
        """Write ``df`` to a new netCDF file at ``output`` and return it re-opened.

        Keyword arguments consumed here (anything left over is forwarded to the
        ``ContiguousRaggedTrajectory`` constructor used for the return value):

        * ``axes``: passed to ``get_default_axes`` to name the axis columns.
        * ``unlimited``: when ``True`` the sample dimension is created unlimited.
        * ``unique_dims``: when ``True`` every dimension is named ``<axis>_dim``.
        * ``attributes``: merged over ``nc.nc_attributes(axes, daxes)``.
        * ``traj_vars``: columns stored once per trajectory (first value found in each
          trajectory) instead of once per sample.
        * ``reduce_dims``: accepted and discarded.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        # Should never be a CR file with one trajectory so we ignore the "reduce_dims" attribute
        kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with ContiguousRaggedTrajectory(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)
            nc.createDimension(daxes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory,
                                           get_dtype(df[axes.trajectory]),
                                           (daxes.trajectory, ))

            # The sample dimension holds every row of the dataframe (or grows on demand)
            if unlimited is True:
                nc.createDimension(daxes.sample, None)
            else:
                nc.createDimension(daxes.sample, len(df))

            # Number of observations in each trajectory
            row_size = nc.createVariable('rowSize', 'i4', (daxes.trajectory, ))

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the trajectory axis
            traj_vars = kwargs.pop('traj_vars', [])
            traj_columns = [p for p in traj_vars if p in df.columns]
            for c in traj_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    create_ncvar_from_series(nc,
                                             var_name, (daxes.trajectory, ),
                                             df[c],
                                             zlib=True,
                                             complevel=1)

            for i, (trajid, trg) in enumerate(trajectory_groups):
                trajectory[i] = trajid
                row_size[i] = len(trg)

                # Save any trajectory variables using the first value found
                # in the column.
                for c in traj_columns:
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        continue
                    v = nc.variables[var_name]
                    vvalues = get_ncdata_from_series(trg[c], v)[0]
                    try:
                        v[i] = vvalues
                    except Exception:
                        # Best effort: log and move on to the next column
                        L.exception('Failed to add {}'.format(c))
                        continue

            # Add all of the columns based on the sample dimension. Take all columns and remove the
            # trajectory, rowSize and other trajectory based columns. The exclusion set is
            # built once instead of once per column.
            not_sampled = set(traj_columns) | {'rowSize', axes.trajectory}
            sample_columns = [f for f in df.columns if f not in not_sampled]
            for c in sample_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name, (daxes.sample, ),
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    if unlimited is True:
                        v[:] = vvalues
                    else:
                        v[:] = vvalues.reshape(v.shape)
                except Exception:
                    # Best effort: log and move on to the next column
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectory(output, **kwargs)
예제 #7
0
파일: om.py 프로젝트: pyoceans/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        # Make a new index that is the Cartesian product of all of the values from all of the
        # values of the old index. This is so don't have to iterate over anything. The full column
        # of data will be able to be shaped to the size of the final unique sized dimensions.
        index_order = [axes.t, axes.z, axes.station]
        df = df.set_index(index_order)
        df = df.reindex(
            pd.MultiIndex.from_product(df.index.levels, names=index_order))

        unique_z = df.index.get_level_values(axes.z).unique().values
        unique_t = df.index.get_level_values(
            axes.t).unique().tolist()  # tolist converts to Timestamp
        all_stations = df.index.get_level_values(axes.station)
        unique_s = all_stations.unique()

        with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:

            if reduce_dims is True and unique_s.size == 1:
                # If a singular trajectory, we can reduce that dimension if it is of size 1
                default_dimensions = (daxes.t, daxes.z)
                station_dimensions = ()
            else:
                default_dimensions = (daxes.t, daxes.z, daxes.station)
                station_dimensions = (daxes.station, )
                nc.createDimension(daxes.station, unique_s.size)

            station = nc.createVariable(axes.station, get_dtype(unique_s),
                                        station_dimensions)
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          station_dimensions)
            # Assign over loop because VLEN variables (strings) have to be assigned by integer index
            # and we need to find the lat/lon based on station index
            for si, st in enumerate(unique_s):
                station[si] = st
                latitude[si] = df[axes.y][all_stations == st].dropna().iloc[0]
                longitude[si] = df[axes.x][all_stations == st].dropna().iloc[0]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            if unlimited is True:
                nc.createDimension(daxes.t, None)
            else:
                nc.createDimension(daxes.t, len(unique_t))
            time = nc.createVariable(axes.t, 'f8', (daxes.t, ))
            time[:] = date2num(unique_t,
                               units=cls.default_time_unit).astype('f8')

            nc.createDimension(daxes.z, unique_z.size)
            z = nc.createVariable(axes.z, get_dtype(unique_z), (daxes.z, ))
            z[:] = unique_z

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the time axis and not the depth axis
            detach_z_vars = kwargs.pop('detach_z', [])
            detach_z_columnms = [p for p in detach_z_vars if p in data_columns]
            for c in detach_z_columnms:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        default_dimensions[
                            0::2],  # this removes the second dimension (z)
                        df[c],
                        zlib=True,
                        complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {}'.format(axes.t, axes.x, axes.y)
                        })
                else:
                    v = nc.variables[var_name]

                # Because we need access to the fillvalues here, we ask not to return
                # the values with them already filled.
                vvalues = get_ncdata_from_series(df[c], v, fillna=False)
                # Reshape to the full array, with Z
                vvalues = vvalues.reshape(len(unique_t), unique_z.size,
                                          unique_s.size)
                # The Z axis is always the second axis, take the mean over that axis
                vvalues = np.apply_along_axis(np.nanmean, 1, vvalues).flatten()
                # Now reshape to the array without Z
                vvalues = vvalues.reshape(len(unique_t), unique_s.size)
                try:
                    v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            full_columns = [
                f for f in data_columns if f not in detach_z_columnms
            ]
            for c in full_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 default_dimensions,
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                 axes.y)
                        })
                else:
                    v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(df[c], v)
                v[:] = vvalues.reshape(v.shape)

            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
예제 #8
0
    def from_dataframe(cls, df, output, **kwargs):
        """Write ``df`` to ``output`` as an IncompleteMultidimensionalTrajectory file.

        Rows are grouped by the trajectory column; each group is written as one
        trajectory, filling the sample dimension from index 0 up to the group
        size.

        Keyword options consumed here (whatever is left in ``kwargs`` is passed
        to the ``IncompleteMultidimensionalTrajectory`` constructor that builds
        the return value):

        axes: mapping handed to ``get_default_axes`` to name the axis columns.
        reduce_dims: when ``True`` and exactly one trajectory is present, no
            trajectory dimension is created and the variables are defined on
            the sample dimension only.
        unlimited: when ``True`` the sample dimension is created with size
            ``None``; otherwise it is sized to the largest trajectory group.
        unique_dims: when ``True`` every dimension name gets a ``_dim`` suffix
            so that it differs from the coordinate variable of the same axis.
        attributes: mapping combined with ``nc.nc_attributes(axes, daxes)``
            through ``dict_update`` and applied to the file at the end.

        Returns ``IncompleteMultidimensionalTrajectory(output, **kwargs)``.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        # Every column that is not named as an axis is treated as a data variable
        data_columns = [col for col in df.columns if col not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        if kwargs.pop('unique_dims', False) is True:
            # A dimension and a coordinate sharing one name is not supported in
            # xarray, so give the dimensions their own suffixed names
            suffixed = {}
            for key, name in axes._asdict().items():
                suffixed[key] = '{}_dim'.format(name)
            daxes = get_default_axes(suffixed)
        else:
            daxes = axes

        # Timezone aware datetimes become native UTC times, then int64 is
        # downcast to int32
        df = downcast_dataframe(nativize_times(df))

        with IncompleteMultidimensionalTrajectory(output, 'w') as nc:

            by_trajectory = df.groupby(axes.trajectory)

            sample_size = None if unlimited is True else by_trajectory.size().max()
            nc.createDimension(daxes.sample, sample_size)

            trajectory_count = len(by_trajectory)
            # A lone trajectory may drop its (size 1) dimension when asked to
            single = reduce_dims is True and trajectory_count == 1
            if single:
                default_dimensions = (daxes.sample,)
                trajectory = nc.createVariable(
                    axes.trajectory,
                    get_dtype(df[axes.trajectory])
                )
            else:
                default_dimensions = (daxes.trajectory, daxes.sample)
                nc.createDimension(daxes.trajectory, trajectory_count)
                trajectory = nc.createVariable(
                    axes.trajectory,
                    get_dtype(df[axes.trajectory]),
                    (daxes.trajectory,)
                )

            def target(t_index, size):
                # Index expression addressing the first `size` samples of a trajectory
                span = slice(0, size)
                return span if single else (t_index, span)

            fill = cls.default_fill_value

            def coordinate(column):
                # Coordinate variable typed after (and filled according to) its column
                series = df[column]
                return nc.createVariable(
                    column,
                    get_dtype(series),
                    default_dimensions,
                    fill_value=series.dtype.type(fill)
                )

            # Axis variables: time is always stored as 'f8'
            time = nc.createVariable(
                axes.t,
                'f8',
                default_dimensions,
                fill_value=np.dtype('f8').type(fill)
            )
            z = coordinate(axes.z)
            latitude = coordinate(axes.y)
            longitude = coordinate(axes.x)

            defaults = nc.nc_attributes(axes, daxes)
            attributes = dict_update(defaults, kwargs.pop('attributes', {}))

            # Define the data variables from the full dataframe so that every
            # column gets a variable before any trajectory is written
            coordinates = '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
            for col in data_columns:
                var_name = cf_safe_name(col)
                if var_name in nc.variables:
                    continue
                create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions,
                    df[col],
                    zlib=True,
                    complevel=1
                )
                attributes[var_name] = dict_update(
                    attributes.get(var_name, {}),
                    {'coordinates': coordinates}
                )

            # Fill in the values one trajectory at a time
            for index, (uid, group) in enumerate(by_trajectory):
                trajectory[index] = uid

                for column, variable in ((axes.t, time), (axes.y, latitude), (axes.x, longitude)):
                    values = get_ncdata_from_series(group[column], variable)
                    variable[target(index, values.size)] = values

                # z is filled straight from the series with the variable's fill value
                depths = group[axes.z].fillna(get_fill_value(z)).values
                z[target(index, depths.size)] = depths

                for col in data_columns:
                    variable = nc.variables[cf_safe_name(col)]
                    values = get_ncdata_from_series(group[col], variable)
                    variable[target(index, values.size)] = values

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalTrajectory(output, **kwargs)
    "longitude": {"units": "degrees_east", "standard_name": "longitude",},
    "latitude": {"units": "degrees_north", "standard_name": "latitude",},
    "z": {"units": "m", "standard_name": "depth", "positive": "down",},
    "u": {"units": "m/s", "standard_name": "eastward_sea_water_velocity",},
    "v": {"units": "m/s", "standard_name": "northward_sea_water_velocity",},
    "station": {"cf_role": "timeseries_id"},
}

We also need to map our data axes to [`pocean`'s defaults](https://github.com/pyoceans/pocean-core/blob/master/pocean/utils.py#L50-L59). This step is not needed if the data axes are already named like the default ones.

axes = {"t": "time", "x": "longitude", "y": "latitude", "z": "depth"}

from pocean.dsg.timeseries.om import OrthogonalMultidimensionalTimeseries
from pocean.utils import downcast_dataframe

# Downcast the DataFrame, then write it to "fake_buoy.nc" with the `attributes`
# and `axes` mappings; the object returned by `from_dataframe` is kept in `dsg`.
df = downcast_dataframe(df)  # safely cast depth np.int64 to np.int32
dsg = OrthogonalMultidimensionalTimeseries.from_dataframe(
    df, output="fake_buoy.nc", attributes=attributes, axes=axes,
)

The `OrthogonalMultidimensionalTimeseries` saves the DataFrame into a CF-1.6 TimeSeries DSG.

!ncdump -h fake_buoy.nc

It also outputs the dsg object for inspection. Let us check a few things to see if our object was created as expected. (Note that some of the metadata was "free" due to the built-in defaults in `pocean`.)

dsg.getncattr("featureType")

type(dsg)

In addition to the standard `netCDF4-python` object `.variables` method, `pocean`'s DSGs provide a "categorized" version of the variables in the `data_vars`, `ancillary_vars`, and the DSG axes methods.