# Example 1
def open_multifile_dataset(fileNames, calendar, config,
                           simulationStartTime=None,
                           timeVariableName='Time',
                           variableList=None, selValues=None,
                           iselValues=None, variableMap=None,
                           startDate=None, endDate=None,
                           chunking=None):  # {{{
    """
    Opens and returns an xarray data set given file name(s) and the MPAS
    calendar name.

    Parameters
    ----------
    fileNames : list of strings
        A list of file paths to read

    calendar : {``'gregorian'``, ``'gregorian_noleap'``}, optional
        The name of one of the calendars supported by MPAS cores

    config : instance of ``MpasAnalysisConfigParser``
        Contains configuration options

    simulationStartTime : string, optional
        The start date of the simulation, used to convert from time variables
        expressed as days since the start of the simulation to days since the
        reference date. ``simulationStartTime`` takes one of the following
        forms::

            0001-01-01
            0001-01-01 00:00:00

        ``simulationStartTime`` is only required if the MPAS time variable
        (identified by ``timeVariableName``) is a number of days since the
        start of the simulation.

    timeVariableName : string, optional
        The name of the time variable (typically ``'Time'`` if using a
        ``variableMap`` or ``'xtime'`` if not using a ``variableMap``)

    variableList : list of strings, optional
        If present, a list of variables to be included in the data set

    selValues : dict, optional
        A dictionary of coordinate names (keys) and values or arrays of
        values used to slice the variables in the data set.  See
        ``xarray.DataSet.sel()`` for details on how this dictionary is used.
        An example::

            selectCoordValues = {'cellLon': 180.0}

    iselValues : dict, optional
        A dictionary of coordinate names (keys) and indices, slices or
        arrays of indices used to slice the variables in the data set.  See
        ``xarray.DataSet.isel()`` for details on how this dictionary is used.
        An example::

            iselValues = {'nVertLevels': slice(0, 3),
                          'nCells': cellIDs}

    variableMap : dict, optional
        A dictionary with keys that are variable names used by
        MPAS-Analysis and values that are lists of possible names for the same
        variable in the MPAS dycore that produced the data set (which may
        differ between versions).

    startDate, endDate : string or datetime.datetime, optional
        If present, the first and last dates to be used in the data set.  The
        time variable is sliced to only include dates within this range.

    chunking : None, int, True, dict, optional
        If integer is present, applies maximum chunk size from config file
        value ``maxChunkSize``, otherwise if None do not perform chunking.  If
        True, use automated chunking using default config value
        ``maxChunkSize``. If chunking is a dict use dictionary values for
        chunking.

    Returns
    -------
    ds : ``xarray.Dataset``

    Raises
    ------
    TypeError
        If the time variable has an unsupported type (not a date string,
        a floating-point number of days since the start of the simulation
        or a ``numpy.datetime64`` object).

    ValueError
        If the time variable is not found in the data set or if the time
        variable is a number of days since the start of the simulation but
        simulationStartTime is None.
    """
    # Authors
    # -------
    # Xylar Asay-Davis, Phillip J. Wolfram

    # bind all per-file preprocessing options into a single callable that
    # open_mfdataset can apply to each file as it is read
    preprocess_partial = partial(_preprocess,
                                 calendar=calendar,
                                 simulationStartTime=simulationStartTime,
                                 timeVariableName=timeVariableName,
                                 variableList=variableList,
                                 selValues=selValues,
                                 iselValues=iselValues,
                                 variableMap=variableMap,
                                 startDate=startDate,
                                 endDate=endDate)

    kwargs = {'decode_times': False,
              'concat_dim': 'Time'}

    autocloseFileLimitFraction = config.getfloat('input',
                                                 'autocloseFileLimitFraction')

    # get the number of files that can be open at the same time.  We want the
    # "soft" limit because we'll get a crash if we exceed it.
    softLimit = resource.getrlimit(resource.RLIMIT_NOFILE)[0]

    # use autoclose if we will use more than autocloseFileLimitFraction (50%
    # by default) of the soft limit of open files
    autoclose = len(fileNames) > softLimit*autocloseFileLimitFraction

    try:
        ds = xarray.open_mfdataset(fileNames,
                                   preprocess=preprocess_partial,
                                   autoclose=autoclose, **kwargs)
    except TypeError as e:
        if 'autoclose' in str(e):
            if autoclose:
                # This indicates that xarray version doesn't support autoclose
                print('Warning: open_multifile_dataset is trying to use '
                      'autoclose=True but\n'
                      'it appears your xarray version doesn\'t support this '
                      'argument. Will\n'
                      'try again without autoclose argument.')

            ds = xarray.open_mfdataset(fileNames,
                                       preprocess=preprocess_partial,
                                       **kwargs)
        else:
            # bare raise preserves the original traceback (unlike `raise e`)
            raise

    ds = mpas_xarray.remove_repeated_time_index(ds)

    if startDate is not None and endDate is not None:
        # convert date strings to days-since-reference so they can be used
        # to slice the numeric Time coordinate
        if isinstance(startDate, six.string_types):
            startDate = string_to_days_since_date(dateString=startDate,
                                                  calendar=calendar)
        if isinstance(endDate, six.string_types):
            endDate = string_to_days_since_date(dateString=endDate,
                                                calendar=calendar)

    # select only the data in the specified range of dates
    ds = ds.sel(Time=slice(startDate, endDate))

    if ds.dims['Time'] == 0:
        raise ValueError('The data set contains no Time entries between '
                         'dates {} and {}.'.format(
                             days_to_datetime(startDate, calendar=calendar),
                             days_to_datetime(endDate, calendar=calendar)))
    # process chunking
    if chunking is True:
        # limit chunk size to prevent memory error
        chunking = config.getint('input', 'maxChunkSize')

    ds = mpas_xarray.process_chunking(ds, chunking)

    # private record of autoclose use
    ds.attrs['_autoclose'] = int(autoclose)

    return ds  # }}}
# Example 2
def _parse_dataset_time(ds, inTimeVariableName, calendar, simulationStartTime,
                        outTimeVariableName, referenceDate):  # {{{
    """
    A helper function for computing a time coordinate from an MPAS time
    variable.  Given a data set and a time variable name (or tuple of 2
    time names), returns a new data set with time coordinate
    `outTimeVariableName` filled with days since `referenceDate`

    Parameters
    ----------
    ds : xarray.DataSet object
        The data set containing an MPAS time variable to be used to build
        an xarray time coordinate.

    inTimeVariableName : string or tuple or list of strings
        The name of the time variable in the MPAS data set that will be
        used to build the 'Time' coordinate.  The array(s) named by
        inTimeVariableName should contain date strings or the number of
        days since the start of the simulation. Typically,
        inTimeVariableName is one of {'daysSinceStartOfSim','xtime'}.
        If a list of two variable
        names is provided, times from the two are averaged together to
        determine the value of the time coordinate.  In such cases,
        inTimeVariableName is typically {['xtime_start', 'xtime_end']}.

    calendar : {'gregorian', 'gregorian_noleap'}
        The name of one of the calendars supported by MPAS cores


    simulationStartTime : string
        The start date of the simulation, used to convert from time variables
        expressed as days since the start of the simulation to days since the
        reference date. `simulationStartTime` takes one of the following
        forms::

            0001-01-01
            0001-01-01 00:00:00

        simulationStartTime is only required if the MPAS time variable
        (identified by timeVariableName) is a number of days since the
        start of the simulation.

    outTimeVariableName : string
        The name of the coordinate to assign times to, typically 'Time'.

    referenceDate : string
        The reference date for the time variable, typically '0001-01-01',
        taking one of the following forms::

            0001-01-01
            0001-01-01 00:00:00

    Returns
    -------
    dataset : xarray.dataset object
        A copy of the input data set with the `outTimeVariableName`
        coordinate containing the time coordinate parsed from
        `inTimeVariableName`.

    Raises
    ------
    TypeError
        If the time variable has an unsupported type (not a date string
        or a floating-point number of days since the start of the
        simulation).
    ValueError
        If  the time variable is a number of days since the start of the
        simulation but simulationStartTime is None.
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    if isinstance(inTimeVariableName, (tuple, list)):
        # we want to average the two
        assert (len(inTimeVariableName) == 2)

        # parse each of the two time variables recursively, then average
        dsStart = _parse_dataset_time(ds=ds,
                                      inTimeVariableName=inTimeVariableName[0],
                                      calendar=calendar,
                                      simulationStartTime=simulationStartTime,
                                      outTimeVariableName=outTimeVariableName,
                                      referenceDate=referenceDate)
        dsEnd = _parse_dataset_time(ds=ds,
                                    inTimeVariableName=inTimeVariableName[1],
                                    calendar=calendar,
                                    simulationStartTime=simulationStartTime,
                                    outTimeVariableName=outTimeVariableName,
                                    referenceDate=referenceDate)
        starts = dsStart[outTimeVariableName].values
        ends = dsEnd[outTimeVariableName].values

        # replace the time in starts with the mean of starts and ends
        dsOut = dsStart.copy()

        # keep the original interval endpoints as extra coordinates
        dsOut.coords['startTime'] = (outTimeVariableName, starts)
        dsOut.coords['endTime'] = (outTimeVariableName, ends)

        dsOut.coords[outTimeVariableName] = (outTimeVariableName, [
            starts[i] + (ends[i] - starts[i]) / 2 for i in range(len(starts))
        ])

    else:

        # there is just one time variable (either because we're recursively
        # calling the function or because we're not averaging).

        # The contents of the time variable is expected to be either a string
        # (|S64) or a float (meaning days since start of the simulation).

        timeVar = ds[inTimeVariableName]

        if timeVar.dtype == '|S64':
            # this is an array of date strings like 'xtime'
            # convert each fixed-width byte array to a stripped string
            # (the former ''.join(...) wrapper was a no-op on a single string)
            timeStrings = [
                str(xtime.astype('U')).strip()
                for xtime in timeVar.values
            ]
            days = string_to_days_since_date(dateString=timeStrings,
                                             referenceDate=referenceDate,
                                             calendar=calendar)

        elif timeVar.dtype == 'float64':
            # this array contains floating-point days like
            # 'daysSinceStartOfSim'

            if simulationStartTime is None:
                raise ValueError('MPAS time variable {} appears to be a '
                                 'number of days since start \n'
                                 'of sim but simulationStartTime was not '
                                 'supplied.'.format(inTimeVariableName))

            if (string_to_datetime(referenceDate) == string_to_datetime(
                    simulationStartTime)):
                # no conversion required: the reference date is the
                # simulation start date
                days = timeVar.values
            else:
                # a conversion may be required
                dates = days_to_datetime(days=timeVar.values,
                                         referenceDate=simulationStartTime,
                                         calendar=calendar)
                days = datetime_to_days(dates=dates,
                                        referenceDate=referenceDate,
                                        calendar=calendar)

        elif timeVar.dtype == 'timedelta64[ns]':
            # timedelta64 indicates the file was read with decode_times=True,
            # which mangles MPAS times; tell the caller explicitly
            raise TypeError('timeVar of unsupported type {}.  This is likely '
                            'because xarray.open_dataset \n'
                            'was called with decode_times=True, which can '
                            'mangle MPAS times.'.format(timeVar.dtype))
        else:
            raise TypeError("timeVar of unsupported type {}".format(
                timeVar.dtype))

        dsOut = ds.copy()
        dsOut.coords[outTimeVariableName] = (outTimeVariableName, days)

    return dsOut  # }}}
# Example 3
def open_mpas_dataset(
        fileName,
        calendar,
        timeVariableNames=('xtime_startMonthly', 'xtime_endMonthly'),
        variableList=None,
        startDate=None,
        endDate=None):  # {{{
    """
    Opens and returns an xarray data set given file name(s) and the MPAS
    calendar name.

    Parameters
    ----------
    fileName : str
        File path to read

    calendar : {``'gregorian'``, ``'gregorian_noleap'``}, optional
        The name of one of the calendars supported by MPAS cores

    timeVariableNames : str or tuple or list of 2 str, optional
        The name of the time variable (typically ``'xtime'``
        or ``['xtime_startMonthly', 'xtime_endMonthly']``), or ``None`` if
        time does not need to be parsed (and is already in the ``Time``
        variable)

    variableList : list of strings, optional
        If present, a list of variables to be included in the data set

    startDate, endDate : string or datetime.datetime, optional
        If present, the first and last dates to be used in the data set.  The
        time variable is sliced to only include dates within this range.

    Returns
    -------
    ds : ``xarray.Dataset``

    Raises
    ------
    TypeError
        If the time variable has an unsupported type (not a date string).

    ValueError
        If the time variable is not found in the data set or the data set
        contains no times (in the given range, if one was given)
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    # NOTE: the default for timeVariableNames is a tuple rather than a list
    # to avoid a mutable default argument; downstream parsing accepts either

    ds = xarray.open_dataset(fileName,
                             decode_cf=True,
                             decode_times=False,
                             lock=False)

    if timeVariableNames is not None:
        # build the 'Time' coordinate (days since the reference date) from
        # the raw MPAS time variable(s)
        ds = _parse_dataset_time(ds, timeVariableNames, calendar)

    if startDate is not None and endDate is not None:
        # convert date strings to days-since-reference so they can be used
        # to slice the numeric Time coordinate
        if isinstance(startDate, six.string_types):
            startDate = string_to_days_since_date(dateString=startDate,
                                                  calendar=calendar)
        if isinstance(endDate, six.string_types):
            endDate = string_to_days_since_date(dateString=endDate,
                                                calendar=calendar)

        # select only the data in the specified range of dates
        ds = ds.sel(Time=slice(startDate, endDate))

    if ds.dims['Time'] == 0:
        if startDate is not None and endDate is not None:
            raise ValueError('The data set contains no Time entries between '
                             'dates {} and {}.'.format(
                                 days_to_datetime(startDate,
                                                  calendar=calendar),
                                 days_to_datetime(endDate,
                                                  calendar=calendar)))
        else:
            # without a date range there are no dates to report
            # (formatting None with days_to_datetime would raise here)
            raise ValueError('The data set contains no Time entries.')
    if variableList is not None:
        ds = subset_variables(ds, variableList)

    return ds  # }}}
# Example 4
    def run_task(self):  # {{{
        '''
        Computes NINO34 index and plots the time series and power spectrum with
        95 and 99% confidence bounds
        '''
        # Authors
        # -------
        # Luke Van Roekel, Xylar Asay-Davis

        config = self.config
        calendar = self.calendar

        regionToPlot = config.get('indexNino34', 'region')

        # region option is presumably of the form 'nino34', 'nino3', etc.;
        # slicing off the first 4 characters leaves just the index number
        # -- TODO confirm the option always starts with 'nino'
        ninoIndexNumber = regionToPlot[4:]

        self.logger.info("\nPlotting El Nino {} Index time series and power "
                         "spectrum....".format(ninoIndexNumber))

        self.logger.info('  Load SST data...')
        fieldName = 'nino'

        # date range over which the model index is computed
        startDate = self.config.get('index', 'startDate')
        endDate = self.config.get('index', 'endDate')

        startYear = self.config.getint('index', 'startYear')
        endYear = self.config.getint('index', 'endYear')

        dataSource = config.get('indexNino34', 'observationData')

        observationsDirectory = build_obs_path(
            config, 'ocean', '{}Subdirectory'.format(fieldName))

        # specify obsTitle based on data path
        # These are the only data sets supported; each has its own reference
        # date for its Time variable
        if dataSource == 'HADIsst':
            dataPath = "{}/HADIsst_nino34_20180710.nc".format(
                observationsDirectory)
            obsTitle = 'HADSST'
            refDate = '1870-01-01'
        elif dataSource == 'ERS_SSTv4':
            dataPath = "{}/ERS_SSTv4_nino34_20180710.nc".format(
                observationsDirectory)
            obsTitle = 'ERS SSTv4'
            refDate = '1800-01-01'
        else:
            raise ValueError('Bad value for config option observationData {} '
                             'in [indexNino34] section.'.format(dataSource))

        mainRunName = config.get('runs', 'mainRunName')

        # regionIndex should correspond to NINO34 in surface weighted Average
        # AM
        # NOTE(review): regionToPlot was already read above; this second read
        # is redundant but harmless
        regions = config.getExpression('regions', 'regions')
        regionToPlot = config.get('indexNino34', 'region')
        regionIndex = regions.index(regionToPlot)

        # Load data:
        ds = open_mpas_dataset(fileName=self.inputFile,
                               calendar=calendar,
                               variableList=self.variableList,
                               startDate=startDate,
                               endDate=endDate)

        # Observations have been processed to the nino34Index prior to reading
        dsObs = xr.open_dataset(dataPath, decode_cf=False, decode_times=False)
        # add the days between 0001-01-01 and the refDate so we have a new
        # reference date of 0001-01-01 (like for the model Time)
        dsObs["Time"] = dsObs.Time + \
            string_to_days_since_date(dateString=refDate, calendar=calendar)
        nino34Obs = dsObs.sst

        self.logger.info(
            '  Compute El Nino {} Index...'.format(ninoIndexNumber))
        varName = self.variableList[0]
        # select the SST time series for the requested region only
        regionSST = ds[varName].isel(nOceanRegions=regionIndex)
        nino34Main = self._compute_nino34_index(regionSST, calendar)

        # Compute the observational index over the entire time range
        # nino34Obs = compute_nino34_index(dsObs.sst, calendar)

        self.logger.info(
            ' Computing El Nino {} power spectra...'.format(ninoIndexNumber))
        spectraMain = self._compute_nino34_spectra(nino34Main)

        # Compute the observational spectra over the whole record
        spectraObs = self._compute_nino34_spectra(nino34Obs)

        # Compute the observational spectra over the last 30 years for
        # comparison. Only saving the spectra
        subsetEndYear = 2016
        if self.controlConfig is None:
            subsetStartYear = 1976
        else:
            # make the subset the same length as the input data set
            subsetStartYear = subsetEndYear - (endYear - startYear)
        time_start = datetime_to_days(datetime.datetime(subsetStartYear, 1, 1),
                                      calendar=calendar)
        time_end = datetime_to_days(datetime.datetime(subsetEndYear, 12, 31),
                                    calendar=calendar)
        nino34Subset = nino34Obs.sel(Time=slice(time_start, time_end))
        spectraSubset = self._compute_nino34_spectra(nino34Subset)

        # Assemble the panels to plot.  The [2:-3] slices presumably trim
        # edge values of the computed indices -- TODO confirm why these
        # particular offsets are used
        if self.controlConfig is None:
            nino34s = [nino34Obs[2:-3], nino34Subset, nino34Main[2:-3]]
            titles = [
                '{} (Full Record)'.format(obsTitle),
                '{} ({} - {})'.format(obsTitle, subsetStartYear,
                                      subsetEndYear), mainRunName
            ]
            spectra = [spectraObs, spectraSubset, spectraMain]
        else:
            # a control run was supplied: compute its index and spectrum and
            # plot it in place of the full observational record
            baseDirectory = build_config_full_path(self.controlConfig,
                                                   'output',
                                                   'timeSeriesSubdirectory')

            refFileName = '{}/{}.nc'.format(
                baseDirectory, self.mpasTimeSeriesTask.fullTaskName)

            dsRef = open_mpas_dataset(fileName=refFileName,
                                      calendar=calendar,
                                      variableList=self.variableList)

            regionSSTRef = dsRef[varName].isel(nOceanRegions=regionIndex)
            nino34Ref = self._compute_nino34_index(regionSSTRef, calendar)

            nino34s = [nino34Subset, nino34Main[2:-3], nino34Ref[2:-3]]
            controlRunName = self.controlConfig.get('runs', 'mainRunName')

            spectraRef = self._compute_nino34_spectra(nino34Ref)

            titles = [
                '{} ({} - {})'.format(obsTitle, subsetStartYear,
                                      subsetEndYear), mainRunName,
                'Control: {}'.format(controlRunName)
            ]
            spectra = [spectraSubset, spectraMain, spectraRef]

        # Convert frequencies to period in years
        for s in spectra:
            s['period'] = \
                1.0 / (constants.eps + s['f'] * constants.sec_per_year)

        self.logger.info(
            ' Plot El Nino {} index and spectra...'.format(ninoIndexNumber))

        # time-series plot plus its XML entry for the web page
        outFileName = '{}/nino{}_{}.png'.format(self.plotsDirectory,
                                                ninoIndexNumber, mainRunName)
        self._nino34_timeseries_plot(
            nino34s=nino34s,
            title=u'El Niño {} Index'.format(ninoIndexNumber),
            panelTitles=titles,
            outFileName=outFileName)

        self._write_xml(filePrefix='nino{}_{}'.format(ninoIndexNumber,
                                                      mainRunName),
                        plotType='Time Series',
                        ninoIndexNumber=ninoIndexNumber)

        # power-spectrum plot plus its XML entry for the web page
        outFileName = '{}/nino{}_spectra_{}.png'.format(
            self.plotsDirectory, ninoIndexNumber, mainRunName)
        self._nino34_spectra_plot(
            spectra=spectra,
            title=u'El Niño {} power spectrum'.format(ninoIndexNumber),
            panelTitles=titles,
            outFileName=outFileName)

        self._write_xml(filePrefix='nino{}_spectra_{}'.format(
            ninoIndexNumber, mainRunName),
                        plotType='Spectra',
                        ninoIndexNumber=ninoIndexNumber)
# Example 5
def _parse_dataset_time(ds,
                        inTimeVariableName,
                        calendar,
                        outTimeVariableName='Time',
                        referenceDate='0001-01-01'):  # {{{
    """
    Build a time coordinate for an MPAS data set.

    Given a data set and the name (or pair of names) of an MPAS time
    variable containing date strings, return a copy of the data set whose
    ``outTimeVariableName`` coordinate holds the time expressed as days
    since ``referenceDate``.

    Parameters
    ----------
    ds : ``xarray.DataSet``
        The data set containing the MPAS time variable(s) to parse.

    inTimeVariableName : str or tuple or list of str
        The name of the time variable used to build the time coordinate,
        typically ``'xtime'``.  The named array(s) should contain date
        strings.  If a pair of names is given (typically
        ``['xtime_startMonthly', 'xtime_endMonthly']``), the two times are
        averaged to produce the coordinate and the originals are retained
        as ``'startTime'`` and ``'endTime'`` coordinates.

    calendar : {'gregorian', 'gregorian_noleap'}
        The name of one of the calendars supported by MPAS cores

    outTimeVariableName : str
        The name of the coordinate to assign times to, typically 'Time'.

    referenceDate : str, optional
        The reference date for the time variable, typically '0001-01-01',
        in the form ``0001-01-01`` or ``0001-01-01 00:00:00``.

    Returns
    -------
    dsOut : ``xarray.DataSet``
        A copy of ``ds`` with the parsed time coordinate in
        ``outTimeVariableName``.

    Raises
    ------
    TypeError
        If the time variable is not an array of date strings.
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    if not isinstance(inTimeVariableName, (tuple, list)):
        # single time variable: parse its date strings directly
        timeVar = ds[inTimeVariableName]

        if timeVar.dtype != '|S64':
            raise TypeError("timeVar of unsupported type {}.  String variable "
                            "expected.".format(timeVar.dtype))

        # decode each fixed-width byte entry into a stripped unicode string
        dateStrings = [str(entry.astype('U')).strip()
                       for entry in timeVar.values]
        daysSinceRef = string_to_days_since_date(dateString=dateStrings,
                                                 referenceDate=referenceDate,
                                                 calendar=calendar)

        dsOut = ds.copy()
        dsOut.coords[outTimeVariableName] = (outTimeVariableName,
                                             daysSinceRef)
        return dsOut

    # a pair of time variables: parse each one recursively, then average
    assert (len(inTimeVariableName) == 2)

    parsedPair = [_parse_dataset_time(ds=ds,
                                      inTimeVariableName=name,
                                      calendar=calendar,
                                      outTimeVariableName=outTimeVariableName,
                                      referenceDate=referenceDate)
                  for name in inTimeVariableName]

    startVals = parsedPair[0][outTimeVariableName].values
    endVals = parsedPair[1][outTimeVariableName].values

    dsOut = parsedPair[0].copy()

    # keep the original interval endpoints alongside the averaged coordinate
    dsOut.coords['startTime'] = (outTimeVariableName, startVals)
    dsOut.coords['endTime'] = (outTimeVariableName, endVals)

    # midpoint of each [start, end] interval becomes the time coordinate
    midpoints = [begin + (finish - begin) / 2
                 for begin, finish in zip(startVals, endVals)]
    dsOut.coords[outTimeVariableName] = (outTimeVariableName, midpoints)

    return dsOut  # }}}