Exemplo n.º 1
0
    def to_array(self, ts, start_date=None, end_date=None, month_index=None, include_months=None, include_missing=True,
                 match_other_nonmissing=None, paired_ts=None, return_type=None):
        """
        Return a list containing values of the time series for the specified period.
        The period that is processed is the one returned by get_valid_period() for the
        requested start and end dates.  Values are read from the time series with
        get_data_value() (regular interval) or from the get_data() list (irregular interval).
        @return The list of values for the time series.  An empty list is returned if the
        calculated data size for the period is zero.  None is returned if an irregular
        time series has no data list.
        @param ts Time series to convert data to array format.
        @param start_date Date corresponding to the first date of the returned array.
        @param end_date Date corresponding to the last date of the returned array.
        @param month_index Month of interest (1=Jan, 12=Dec).  If None or zero, process all months.
        Only used when include_months is None or empty, in which case it is treated as a
        one-element include_months list.
        @param include_months Months of interest (1=Jan, 12=Dec).  If None or empty, process all months.
        @param include_missing Indicate whether missing values should be included in the result.
        Only checked when paired_ts is None.
        @param match_other_nonmissing Only used when paired_ts is specified.  If true, a value is
        returned only when it is non-missing in both ts and paired_ts.  Otherwise a value is returned
        only when it is non-missing in ts and missing in paired_ts.
        @param paired_ts Optional time series used to decide which values of ts are returned.
        It must have a regular interval that matches ts.
        @param return_type TSToArrayReturnType indicating what to return.  If None, the data values
        are returned (TSToArrayReturnType.DATA_VALUE).  For TSToArrayReturnType.DATE_TIME the year
        (Year interval) or absolute month (Month interval) is returned.  Day interval output is not
        yet enabled (see TODO below) so no values are returned for that case.
        @exception ValueError if the paired time series cannot be used, if the interval is not
        supported for date/time output, or if a requested month is not in the range 1-12.
        """
        # A single month of interest is shorthand for a one-element list of months.
        # This replaces a recursive call that never terminated because it did not pass month_index.
        if (month_index is not None) and (month_index != 0) and \
                ((include_months is None) or (len(include_months) == 0)):
            include_months = [month_index]

        if paired_ts is not None:
            if not TimeInterval.is_regular_interval(ts.get_data_interval_base()):
                raise ValueError(
                    "Irregular interval time series cannot have data array extracted using paired time series.")
            if not self.intervals_match(ts, paired_ts):
                raise ValueError(
                    "Time series from which to extract data has a different interval than paired time series.")

        # Get valid dates because the ones passed in may have been None...
        valid_dates = self.get_valid_period(ts, start_date, end_date)
        start = valid_dates.get_date1()
        end = valid_dates.get_date2()

        interval_base = ts.get_data_interval_base()
        interval_mult = ts.get_data_interval_mult()
        # NOTE(review): the same size calculation is used for regular and irregular time series;
        # confirm that calculate_data_size() gives a usable (non-zero) size for irregular data.
        size = self.calculate_data_size(start, end, interval_base, interval_mult)

        if return_type is None:
            return_type = TSToArrayReturnType.DATA_VALUE
        if return_type == TSToArrayReturnType.DATE_TIME:
            # Only 1Year, 1Month, 1Day intervals are supported
            if (interval_mult != 1) or \
                    (interval_base not in (TimeInterval.YEAR, TimeInterval.MONTH, TimeInterval.DAY)):
                raise ValueError(
                    "Interval must be Year, Month, or Day (no multiplier) to return date/time as array.")

        # Build a 12-element mask (index 0=Jan) indicating which months are to be processed.
        if (include_months is None) or (len(include_months) == 0):
            include_months_mask = [True] * 12
        else:
            include_months_mask = [False] * 12
            for month_number in include_months:
                if not 1 <= month_number <= 12:
                    # Without this check 0 would silently select December (index -1)
                    raise ValueError("Month " + str(month_number) + " is not in the range 1-12.")
                include_months_mask[month_number - 1] = True

        if size == 0:
            return []

        # Values are appended as they are accepted so the list is always the exact output size.
        data_array = []

        if interval_base == TimeInterval.IRREGULAR:
            # Get the data and loop through the list...
            alltsdata = ts.get_data()
            if alltsdata is None:
                # No data for the time series...
                return None
            for tsdata in alltsdata:
                date = tsdata.get_date()
                if date.greater_than(end):
                    # Past the end of where we want to go so quit...
                    break
                if not date.greater_than_or_equal_to(start):
                    continue
                if not include_months_mask[date.get_month() - 1]:
                    continue
                value = tsdata.get_data_value()
                if include_missing or not ts.is_data_missing(value):
                    # Date/time output cannot reach this branch because the interval check
                    # above only allows Year, Month and Day intervals.
                    if return_type == TSToArrayReturnType.DATA_VALUE:
                        data_array.append(value)
        else:
            # Regular, increment the data by interval.  The increment is the last statement of the
            # loop body so that no date after the end date is processed.
            date = DateTime(date_time=start)
            while date.less_than_or_equal_to(end):
                # First figure out if the data should be skipped because not in a requested month
                if include_months_mask[date.get_month() - 1]:
                    # Now decide whether to transfer the value, checking the paired time series
                    value = ts.get_data_value(date)
                    is_missing = ts.is_data_missing(value)
                    if paired_ts is not None:
                        # Value in "ts" time series MUST be non-missing
                        if is_missing:
                            do_transfer = False
                        else:
                            is_missing2 = paired_ts.is_data_missing(paired_ts.get_data_value(date))
                            if match_other_nonmissing:
                                # Want non-missing in both "ts" and "paired_ts"
                                do_transfer = not is_missing2
                            else:
                                # Want non-missing in "ts" and missing in "paired_ts"
                                do_transfer = is_missing2
                    else:
                        do_transfer = include_missing or not is_missing

                    if do_transfer:
                        if return_type == TSToArrayReturnType.DATA_VALUE:
                            data_array.append(value)
                        elif return_type == TSToArrayReturnType.DATE_TIME:
                            if interval_base == TimeInterval.YEAR:
                                data_array.append(date.get_year())
                            elif interval_base == TimeInterval.MONTH:
                                data_array.append(date.get_absolute_month())
                            # TODO smalers 2020-01-04 need to enable Day interval:
                            #   data_array.append(date.get_absolute_day())
                            # Until then nothing is output for Day interval.
                date.add_interval(interval_base, interval_mult)

        return data_array
Exemplo n.º 2
0
    def calculate_data_limits(self, ts, start0, end0, refresh_flag):
        """
        Calculate the total data limits for a time series between two dates and save them in this object.
        This code was taken from the TSUtil.getDataLimits method.
        The results are saved by calling this object's set_date1, set_date2, set_max_value,
        set_min_value, set_non_missing_data_date1, set_non_missing_data_date2,
        set_missing_data_count, set_non_missing_data_count, set_sum and set_mean methods.
        The median, standard deviation and skew are also set when enough values are available.
        If the IGNORE_LESS_THAN_OR_EQUAL_ZERO flag is set in self.flags, values <= 0 are treated as missing.
        @param ts Time series of interest.
        @param start0 Starting date for the check.
        @param end0 Ending date for the check.
        @param refresh_flag Indicates whether the time series should be refreshed first
        (in general this is used only within the TS package and the version of this
        routine without the flag should be called).
        @exception Exception if the time series is None, an irregular time series has no data list,
        all values in the period are missing, or any other error occurs.  The original error
        is chained as the cause.
        """
        logger = logging.getLogger(__name__)
        debug = False

        try:
            # Main try...
            if ts is None:
                message = "NULL time series"
                logger.warning(message)
                raise ValueError(message)

            # Initialize the sum to missing so that the first non-missing value resets it...
            missing = ts.get_missing()
            total = missing

            # Get valid date limits because the ones passed in may have been None...
            valid_dates = self.get_valid_period(ts, start0, end0)
            start = valid_dates.get_date1()
            end = valid_dates.get_date2()

            base = ts.get_data_interval_base()
            mult = ts.get_data_interval_mult()
            if refresh_flag:
                # Force a refresh of the time series.
                ts.refresh()

            # Figure out if we are treating data <= 0 as missing...
            ignore_lezero = (self.flags & TSLimits.IGNORE_LESS_THAN_OR_EQUAL_ZERO) != 0

            irregular_data = None
            if base == TimeInterval.IRREGULAR:
                irregular_data = ts.get_data()
                if irregular_data is None:
                    message = "Null data for " + str(ts)
                    logger.warning(message)
                    raise ValueError(message)

            def is_usable(value):
                # A value is used when it is not missing and, if requested, is greater than zero.
                return not (ts.is_data_missing(value) or (ignore_lezero and (value <= 0.0)))

            def forward_points():
                # Generate (date, value) from start to end.  The date may be modified when the
                # next point is requested so the caller must copy it if it is to be kept.
                if irregular_data is not None:
                    # Assumes the data list is in date order, as the original search did.
                    for point in irregular_data:
                        date = point.get_date()
                        if date.less_than(start):
                            # Still looking for data...
                            continue
                        if date.greater_than(end):
                            # No need to continue processing...
                            break
                        yield date, point.get_data_value()
                else:
                    t = DateTime(date_time=start, flag=DateTime.DATE_FAST)
                    while t.less_than_or_equal_to(end):
                        yield t, ts.get_data_value(t)
                        # Increment after the point is used so no date after the end is processed
                        t.add_interval(base, mult)

            def backward_points():
                # Generate (date, value) from end to start, with the same date caveat as above.
                if irregular_data is not None:
                    # Index 0 is included (the index loop this replaces stopped at index 1).
                    for point in reversed(irregular_data):
                        date = point.get_date()
                        if date.greater_than(end):
                            # Have not found data...
                            continue
                        if date.less_than(start):
                            # Passed start...
                            break
                        yield date, point.get_data_value()
                else:
                    t = DateTime(date_time=end, flag=DateTime.DATE_FAST)
                    while t.greater_than_or_equal_to(start):
                        yield t, ts.get_data_value(t)
                        t.add_interval(base, -mult)

            # Loop through the dates and get max and min data values and the first non-missing date...
            max_value = None
            min_value = None
            max_date = None
            min_date = None
            non_missing_data_date1 = None
            non_missing_data_date2 = None
            missing_count = 0
            non_missing_count = 0
            found = False

            for date, value in forward_points():
                if not is_usable(value):
                    # The value is missing
                    missing_count += 1
                    continue

                # Else, data value is not missing...
                if ts.is_data_missing(total):
                    # Reset the sum...
                    total = value
                else:
                    # Add to the sum...
                    total += value
                non_missing_count += 1

                if found:
                    # Already found the first non-missing point so all we need to do is check the
                    # limits.  These should only result in new DateTime a few times...
                    if value > max_value:
                        max_value = value
                        max_date = DateTime(date_time=date)
                    if value < min_value:
                        min_value = value
                        # Only set the date when a new minimum is found
                        min_date = DateTime(date_time=date)
                else:
                    # First non-missing point so set the initial values...
                    first_date = DateTime(date_time=date)
                    max_value = value
                    max_date = first_date
                    min_value = value
                    min_date = first_date
                    non_missing_data_date1 = first_date
                    non_missing_data_date2 = first_date
                    found = True

            # Now search backwards to find the last non-missing date...
            if found:
                for date, value in backward_points():
                    if is_usable(value):
                        # Found the one date we are after...
                        non_missing_data_date2 = DateTime(date_time=date)
                        break

            # TODO SAM 2010-06-15 This is a performance hit, but not too bad
            # TODO SAM 2010-06-15 Consider treating other statistics similarly but need to define unit tests
            # TODO SAM 2010-06-15 This code would need to be changed if doing Lag-1 correlation because order matters
            # For newly added statistics, use helper method to get data, ignoring missing.
            # Keywords are used because the 5th positional parameter of to_array is include_months,
            # not include_missing.
            data_array = self.to_array(ts, start_date=start, end_date=end, month_index=0, include_missing=False)
            if data_array is None:
                # to_array returns None when an irregular time series has no data
                data_array = []
            if ignore_lezero:
                # Remove <= 0 values.  The statistics below do not depend on the order of the values.
                data_array = [data_value for data_value in data_array if not (data_value <= 0.0)]
            n_data_array = len(data_array)

            if n_data_array > 0:
                self.set_median(MathUtil.median(n_data_array, data_array))

            if n_data_array > 1:
                try:
                    self.set_std_dev(MathUtil.standard_deviation(n_data_array, data_array))
                except Exception as e:
                    # Likely due to small sample size, leave the statistic unset
                    logger.debug("Unable to compute standard deviation: %s", e)
            if n_data_array > 2:
                try:
                    self.set_skew(MathUtil.skew(n_data_array, data_array))
                except Exception as e:
                    # Likely due to small sample size, leave the statistic unset
                    logger.debug("Unable to compute skew: %s", e)

            if not found:
                # NOTE(review): assumes the time series class names this method get_identifier_string(),
                # consistent with the other snake_case methods used here - confirm.
                message = "\"" + ts.get_identifier_string() + "\": problems finding limits, whole POR missing!"
                logger.warning(message)
                raise ValueError(message)

            if debug:
                logger.debug("Overall date limits are: " + str(start) + " to " + str(end))
                logger.debug("Found limits to be: " + str(min_value) + " on " + str(min_date) + " to " +
                             str(max_value) + " on " + str(max_date))
                logger.debug("Found non-missing data dates to be: " + str(non_missing_data_date1) + " -> " +
                             str(non_missing_data_date2))

            # Set the basic information...
            self.set_date1(start)
            self.set_date2(end)
            self.set_max_value(max_value, max_date)
            self.set_min_value(min_value, min_date)
            self.set_non_missing_data_date1(non_missing_data_date1)
            self.set_non_missing_data_date2(non_missing_data_date2)
            self.set_missing_data_count(missing_count)
            self.set_non_missing_data_count(non_missing_count)
            if not ts.is_data_missing(total) and (non_missing_count > 0):
                mean = total/float(non_missing_count)
            else:
                mean = missing
            self.set_sum(total)
            self.set_mean(mean)
        except Exception as e:
            message = "Error computing limits."
            logger.warning(message)
            # Put in debug because output sometimes is overwhelming when data are not available.
            if debug:
                logger.warning(e)
            # Chain the cause so that the original error is not lost
            raise Exception(message) from e