def concat(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Concatenate dataframes column-wise, resampling to a common frequency.

    If every frame has the same inferred index frequency they are simply
    concatenated along the columns. Otherwise the first frequency from a
    fixed list (annual, quarterly, monthly, bi-weekly, weekly - coarsest
    first) that is present among the inputs becomes the target, and every
    frame with a different frequency is resampled to it via
    ``transform.resample``.

    The resampling operation is chosen from the first value of the "Tipo"
    and "Unidad" column levels:

    * "Stock" -> ``last``
    * "Flujo" whose unit does not contain "%", "=" or "Cambio" -> ``sum``
    * anything else -> ``mean``

    Parameters
    ----------
    dfs : list of pd.DataFrame
        Frames with datetime-like indexes.

    Returns
    -------
    pd.DataFrame
        The column-wise concatenation.

    Raises
    ------
    ValueError
        If the frequencies differ and none of them is one of the supported
        target frequencies (previously this surfaced as an
        ``UnboundLocalError``).
    """
    freqs = [pd.infer_freq(df.index) for df in dfs]
    if all(freq == freqs[0] for freq in freqs):
        return pd.concat(dfs, axis=1)

    for freq_opt in ["A-DEC", "A", "Q-DEC", "Q", "M", "2W-SUN", "W-SUN"]:
        if freq_opt not in freqs:
            continue
        output = []
        # Reuse the frequencies inferred above instead of inferring again.
        for df, freq_df in zip(dfs, freqs):
            if freq_df == freq_opt:
                df_match = df.copy()
            else:
                type_df = df.columns.get_level_values("Tipo")[0]
                unit_df = df.columns.get_level_values("Unidad")[0]
                if type_df == "Stock":
                    operation = "last"
                elif type_df == "Flujo" and not any(
                        x in unit_df for x in ["%", "=", "Cambio"]):
                    operation = "sum"
                else:
                    operation = "mean"
                df_match = transform.resample(df, rule=freq_opt,
                                              operation=operation)
            output.append(df_match)
        return pd.concat(output, axis=1)

    raise ValueError(
        "Cannot concatenate: frequencies {} differ and none of them is a "
        "supported target frequency.".format(freqs))
def test_shift_ruptures_shift_min(midday):
    """Check that ``shift_min`` controls which shifts ``shifts_ruptures`` reports.

    A 30 minute shift is injected between 2020-01-01 and 2020-01-25. With
    ``shift_min=60`` nothing may be flagged; with ``shift_min=30`` the
    injected shift is expected, unless the index is hourly.
    """
    shifted = _shift_between(midday, 30, start='2020-01-01', end='2020-01-25')

    zeros = pd.Series(0, index=shifted.index, dtype='int64')
    expected_amount = pd.Series(0, index=shifted.index, dtype='int64')
    expected_amount.loc['2020-01-01':'2020-01-25'] = 30

    # Threshold above the injected shift: nothing should be detected.
    mask, amount = time.shifts_ruptures(
        shifted, midday, shift_min=60, round_up_from=40
    )
    assert not mask.any()
    assert_series_equal(amount, zeros, check_names=False)

    # Threshold equal to the injected shift.
    mask, amount = time.shifts_ruptures(shifted, midday, shift_min=30)

    # NOTE(review): for hourly data the expected mask is the scalar False,
    # not a Series - confirm this is what assert_series_equal should get.
    if pd.infer_freq(shifted.index) != 'H':
        expected_mask = expected_amount != 0
    else:
        expected_mask = False
    assert_series_equal(mask, expected_mask, check_names=False)

    if pd.infer_freq(shifted.index) != 'H':
        expected = expected_amount
    else:
        expected = zeros
    assert_series_equal(amount, expected, check_names=False)
def select_same_time_slice(reference_ds, ds):
    """Select from `ds` the time span covered by `reference_ds`.

    Both arguments must expose a ``time`` coordinate with ``.values``,
    ``.min()`` and ``.max()``, and `ds` must support
    ``.sel(time=slice(...))`` (presumably xarray objects - confirm against
    callers).

    Parameters
    ----------
    reference_ds
        Object whose time range defines the slice.
    ds
        Object to be sliced.

    Returns
    -------
    The result of ``ds.sel(time=slice(min_time, new_max))``.

    Raises
    ------
    AssertionError
        If the inferred frequencies of the two time axes differ.
    """
    # CHECK THEY ARE THE SAME FREQUENCY
    freq = pd.infer_freq(reference_ds.time.values)
    old_freq = pd.infer_freq(ds.time.values)
    assert freq == old_freq, f"The frequencies should be the same! currenlty ref: {freq} vs. old: {old_freq}"

    # Time span of the reference object.
    min_time = reference_ds.time.min().values
    max_time = reference_ds.time.max().values
    orig_time_range = pd.date_range(min_time, max_time, freq=freq)

    # The upper bound is pushed ONE EXTRA PERIOD past the reference range;
    # the original author did this so that the whole slice is selected.
    periods = len(orig_time_range) + 1
    new_max = pd.date_range(min_time, freq=freq, periods=periods).max()

    # select using the NEW MAX as upper limit
    ds = ds.sel(time=slice(min_time, new_max))

    print_time_min = pd.to_datetime(ds.time.min().values)
    print_time_max = pd.to_datetime(ds.time.max().values)

    # Best effort: objects without `.var().variables` are described by
    # their `.name` instead.
    try:
        var_names = list(ds.var().variables)
    except Exception:
        var_names = ds.name

    print(f"Select same timeslice for ds with vars: {var_names}. Min {print_time_min} Max {print_time_max}")
    return ds
def infer_freq(da):
    """Infer temporal resolution of a dataset.

    Parameters
    ----------
    da : xarray.DataArray
        DataArray to process. Only ``da.indexes['time']`` is read.

    Returns
    -------
    str
        Inferred temporal resolution.

    Raises
    ------
    ValueError
        If no frequency can be inferred, even after the fallback.
    """
    time_index = da.indexes['time']

    # Uniformly spaced data (e.g. hourly, daily) is resolved directly.
    inferred = pd.infer_freq(time_index)
    if inferred:
        return inferred

    # Fallback for input that is no longer uniform in time (it might be
    # seasonalized): snap every timestamp to the first day of its month at
    # 00:00 and extrapolate from the first three time steps only.
    snapped = time_index.to_series().apply(
        lambda stamp: stamp.replace(day=1, hour=0, minute=0)
    )
    inferred = pd.infer_freq(snapped[:3])
    if not inferred:
        raise ValueError('Could not infer frequency.')
    return inferred
def load_1min_gwangali_sitewise():
    """Load the site-wise AWS text files and stack them into one frame.

    Per the original author: loads 1 minute data from 2 sites close to
    gwangali; this does not contain water temperature, tide and salinity.

    Files are read from ``<cwd>/data/AWS_data/site_wise``. Every ``*.txt``
    file whose stem ends in ``_a`` is paired with one whose stem ends in
    ``_b``. The ``_a`` file (indexed by its ``Date_Time1`` column) is used
    when a regular frequency can be inferred from it; otherwise the paired
    ``_b`` file (indexed by ``Date_Time2``) is used instead.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the selected files; empty if there are
        no file pairs.
    """
    # Join path components individually so this also works outside Windows.
    _d_dir = os.path.join(os.getcwd(), 'data', 'AWS_data', 'site_wise')

    # os.listdir gives no ordering guarantee; sort so that the i-th "_a"
    # file is paired with the i-th "_b" file deterministically.
    _files = sorted(f for f in os.listdir(_d_dir) if f.endswith('txt'))
    a_files = [f for f in _files if f.split('.')[0].endswith('_a')]
    b_files = [f for f in _files if f.split('.')[0].endswith('_b')]

    frames = []
    for af, bf in zip(a_files, b_files):
        _df = pd.read_csv(os.path.join(_d_dir, af))
        _df.index = pd.to_datetime(_df['Date_Time1'])
        _df.index.freq = pd.infer_freq(_df.index)

        if _df.index.freq is None:
            # The "_a" file is irregular: fall back to the "_b" file.
            _df = pd.read_csv(os.path.join(_d_dir, bf))
            _df.index = pd.to_datetime(_df['Date_Time2'])
            _df.index.freq = pd.infer_freq(_df.index)
            print(_df.index.freq, ' taken from ', bf)

        print(_df.index.freq)
        frames.append(_df)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames)
def _bias_correct(self, add_error_scale, mul_error_scale, frcst_matrix, freq): frcst_matrix = frcst_matrix.resample(freq).sum() frcst_matrix = frcst_matrix.astype(np.float) # Checks the frequency of error scales are of the same as given freq. add_error_freq = pd.infer_freq(add_error_scale.index, warn=True) mul_error_freq = pd.infer_freq(mul_error_scale.index, warn=True) if add_error_freq != freq: raise ValueError("Error scale frequency is not same as the passed freq: %s" % freq, add_error_freq) if mul_error_freq != freq: raise ValueError("Error scale frequency is not same as the passed freq: %s" % freq, mul_error_freq) # blancket corrector function def blanket_corrector(series, err_scl): try: add_err = float(err_scl.at[series.name, 'value']) except Exception as ex: print(ex) return series bring_forward = 0.0 corr_vals = [] keys = [] for index, value in series.iteritems(): correction = value + (bring_forward + add_err) if correction >= 0: corr_vals.append(correction) keys.append(index) bring_forward = 0.0 else: corr_vals.append(0.0) keys.append(index) bring_forward = correction return pd.Series(data=corr_vals, index=keys, dtype=np.float).rename(series.name) # multiplier corrector function def multiplier_corrector(series, err_scl): try: mul_err = float(err_scl.at[series.name, 'value']) except Exception as ex: print(ex) return series corr_vals = [] keys = [] for index, value in series.iteritems(): correction = value * mul_err if correction >= 0: corr_vals.append(correction) keys.append(index) else: corr_vals.append(value) keys.append(index) return pd.Series(data=corr_vals, index=keys, dtype=np.float).rename(series.name) return frcst_matrix.apply(blanket_corrector, axis='columns', raw=False, err_scl=add_error_scale)
def fill_missing_values(frame, inferred_feq: datetime.timedelta = None):
    """Insert rows for missing timestamps and forward-fill them.

    If the index already has an inferable frequency, that frequency is
    attached to the index (in place) and `frame` itself is returned.
    Otherwise the most common spacing between consecutive index values is
    taken as the step (unless `inferred_feq` is given), the index is walked
    from its minimum to its maximum in that step, absent timestamps are
    added as NaN rows and the result is forward-filled.

    Parameters
    ----------
    frame : pandas.DataFrame
        Datetime-indexed frame.
    inferred_feq : datetime.timedelta, optional
        Step to use when walking the index.

    Returns
    -------
    pandas.DataFrame
        `frame` itself when nothing had to be filled or it has at most one
        row; otherwise a new, sorted, forward-filled frame.

    Warns
    -----
    UserWarning
        If the share of missing values exceeds
        ``WeatherCache.PERCENTAGE_MISSING_VALUES_THRESHOLD``.
    """
    # Nothing to infer or fill for empty / single-row frames.
    if len(frame) <= 1:
        return frame

    freq = pandas.infer_freq(frame.index)
    if freq:
        if not frame.index.freq:
            frame.set_index(pandas.DatetimeIndex(frame.index.values, freq=freq), inplace=True)
        return frame

    if not inferred_feq:
        # Most frequent gap between consecutive timestamps.
        gaps = collections.Counter(
            later - earlier
            for earlier, later in zip(frame.index[:-1], frame.index[1:])
        )
        inferred_feq = gaps.most_common(1)[0][0]

    start_index = frame.index.min()
    end_index = frame.index.max()

    missing_values = []
    current_index = start_index
    while current_index < end_index:
        if current_index not in frame.index:
            missing_values.append(current_index)
        current_index += inferred_feq

    percentage_missing_values = float(
        len(missing_values)) / (frame.shape[0] + len(missing_values))
    if percentage_missing_values > WeatherCache.PERCENTAGE_MISSING_VALUES_THRESHOLD:
        warnings.warn(
            'Missing values constitute {0:.2f}% of all values in the frame which exceeds {1}% threshold'
            .format(
                percentage_missing_values * 100.0,
                WeatherCache.PERCENTAGE_MISSING_VALUES_THRESHOLD * 100.0))

    if missing_values:
        data = numpy.full((len(missing_values), len(frame.columns)), numpy.nan)
        missing_values_frame = pandas.DataFrame(
            index=missing_values, data=data, columns=frame.columns)
        # DataFrame.append was removed from pandas; concat is the equivalent.
        filled_frame = pandas.concat([frame, missing_values_frame])
    else:
        # Keep the input untouched, as the append-based code did.
        filled_frame = frame.copy()

    filled_frame.sort_index(inplace=True)
    filled_frame.ffill(inplace=True)

    freq = pandas.infer_freq(filled_frame.index)
    if freq:
        filled_frame.set_index(pandas.DatetimeIndex(
            filled_frame.index.values, freq=freq), inplace=True)
    return filled_frame
def __init__(self, df: pd.DataFrame = None, tz: str = None, units: Union[str, list] = None, name: str = "") -> None:
    """Initializes the Market.

    Parameters
    ----------
    df : pd.DataFrame, optional
        Market data. ``None`` or an empty frame creates an 'Empty Market'
        (in which case `name` is ignored).
    tz : str, optional
        Time zone name understood by ``pytz``; defaults to UTC.
    units : str or list, optional
        Units; its length must equal the number of data columns.
    name : str
        Name of the market.

    Raises
    ------
    AssertionError
        If ``len(units)`` differs from the number of columns.
    """
    # Deal with DataFrame
    if df is None or df.empty:
        self.data = pd.DataFrame(index=None, data=None)
        self.start_utc = None
        self.end_utc = None
        self.dims = (0, 0)
        self.freq = None
        self.name = 'Empty Market'
    else:
        # Extract values
        # NOTE(review): a type object never equals the string 'str', so
        # this branch is currently never taken and every frame goes through
        # the `else` below. It was presumably meant to be
        # `isinstance(df.index[0], str)`; left unchanged because enabling
        # it depends on `fmt`, which is not defined in this block.
        if type(df.index[0]) == 'str':
            new_index = pd.to_datetime(df.index, format=fmt)
            self.data = pd.DataFrame(index=new_index, data=df.values)
            self.start_utc = datetime.strptime(str(new_index[0]), fmt)
            self.end_utc = datetime.strptime(str(new_index[-1]), fmt)
            self.dims = df.shape
            try:
                self.freq = pd.infer_freq(new_index)
            except Exception:
                # Frequency inference is best effort only.
                self.freq = 'Unknown'
            self.name = name
        else:
            self.data = df
            self.start_utc = df.index[0]
            self.end_utc = df.index[-1]
            self.dims = df.shape
            try:
                self.freq = pd.infer_freq(df.index)
            except Exception:
                # Frequency inference is best effort only.
                self.freq = 'Unknown'
            self.name = name

    # Deal with unit
    if units is None:
        self.units = None
    else:
        assert (len(units) == len(self.data.columns))
        self.units = units

    # Deal with timezone
    if tz is None:
        self.tz = 'UTC'
        self.timezone = pytz.utc
    else:
        self.tz = tz
        self.timezone = pytz.timezone(tz)
def shift_dates(self, h):
    """ Auxiliary function for creating dates for forecasts

    Works on a deep copy of ``self.index`` with the first ``self.max_lag``
    entries dropped.

    * Non-pandas index (``self.is_pandas`` is not True): `h` values are
      appended, each one greater than the previous last value.
    * Integer pandas index: `h` values are appended, continuing the last
      observed step.
    * Datetime pandas index: a ``pd.DateOffset`` is added to the whole
      index `h` times; its size is the ``.seconds`` (hourly / month-end /
      second frequencies) or ``.days`` (otherwise) component of the last
      step. NOTE(review): this shifts rather than extends the index -
      confirm that is intended.

    Parameters
    ----------
    h : int
        How many steps to forecast

    Returns
    ----------
    A transformed date_index object
    """
    date_index = copy.deepcopy(self.index)
    date_index = date_index[self.max_lag:len(date_index)]

    if self.is_pandas is True:
        # `pd.tseries.index` / `pd.core.index` no longer exist; the public
        # top-level classes are used instead.
        if isinstance(date_index, pd.DatetimeIndex):
            # Old and new pandas spellings of hourly / month-end / secondly.
            if pd.infer_freq(date_index) in ('H', 'h', 'M', 'ME', 'S', 's'):
                for _ in range(h):
                    date_index += pd.DateOffset(
                        (date_index[-1] - date_index[-2]).seconds)
            else:
                # Assume higher frequency (configured for days)
                for _ in range(h):
                    date_index += pd.DateOffset(
                        (date_index[-1] - date_index[-2]).days)
        elif isinstance(date_index, pd.Index) and pd.api.types.is_integer_dtype(date_index):
            for _ in range(h):
                values = date_index.values
                new_value = values[-1] + (values[-1] - values[-2])
                date_index = pd.Index(np.append(values, new_value))
    else:
        for _ in range(h):
            date_index.append(date_index[-1] + 1)

    return date_index
def select_same_time_slice(reference_ds, ds):
    """ Select the values for the same timestep as the reference ds

    Both arguments must expose a ``time`` coordinate with ``.values``,
    ``.min()`` and ``.max()``, and `ds` must support
    ``.sel(time=slice(...))`` (presumably xarray objects - confirm against
    callers).

    NOTE(review): the lower bound of the slice is hard-coded to 2001-01-31
    and the fallback frequency to "M"; both are problem-specific hacks kept
    from the original and should become parameters.

    Raises
    ------
    AssertionError
        If the sliced `ds` and `reference_ds` have different numbers of
        time steps.
    """
    # CHECK THEY ARE THE SAME FREQUENCY
    # get the frequency of the time series from reference_ds
    freq = pd.infer_freq(reference_ds.time.values)
    if freq is None:
        warnings.warn("HARDCODED FOR THIS PROBLEM BUT NO IDEA WHY NOT WORKING")
        freq = "M"

    # Kept (although unused) so the disabled frequency comparison can be
    # re-enabled; it also still fails early on un-inferable input.
    old_freq = pd.infer_freq(ds.time.values)
    warnings.warn(
        "Disabled the assert statement. ENSURE FREQUENCIES THE SAME (e.g. monthly)"
    )

    # Time span of the reference object.
    min_time = reference_ds.time.min().values
    max_time = reference_ds.time.max().values
    orig_time_range = pd.date_range(min_time, max_time, freq=freq)

    # Same number of periods as the reference range (the "+ 1" extension
    # was disabled by the original author).
    periods = len(orig_time_range)
    new_max = pd.date_range(min_time, freq=freq, periods=periods).max()

    # --------------------------------------------------------------------------
    # Per the original author, slicing from the computed minimum dropped
    # the first time step, so the lower bound is overridden here.
    warnings.warn("L153: HARDCODING THE MIN VALUE OTHERWISE IGNORED ...")
    min_time = datetime.datetime(2001, 1, 31)
    # --------------------------------------------------------------------------

    ds = ds.sel(time=slice(min_time, new_max))

    assert (
        reference_ds.time.shape[0] == ds.time.shape[0]
    ), f"The time dimensions should match, currently reference_ds.time dims {reference_ds.time.shape[0]} != ds.time dims {ds.time.shape[0]}"

    print_time_min = pd.to_datetime(ds.time.min().values)
    print_time_max = pd.to_datetime(ds.time.max().values)

    # Best effort: objects without `.var().variables` are described by
    # their `.name` instead.
    try:
        var_names = list(ds.var().variables)
    except Exception:
        var_names = ds.name

    print(
        f"Select same timeslice for ds with vars: {var_names}. Min {print_time_min} Max {print_time_max}"
    )
    return ds
def calculate_mas(data_df, periods):
    """Add moving-average columns and a bull-market flag to `data_df`.

    For every value ``p`` in `periods` the columns ``'ema<p>'`` (exponential
    moving average, ``span=p``) and ``'ma<p>'`` (simple rolling mean over
    ``p`` rows) of ``'close'`` are added. A ``'200dma'`` column holds the
    rolling mean over the number of rows that span 200 days at the index's
    frequency, and ``'bull'`` is True where ``close > 200dma``.

    The bar length is taken from the inferred index frequency, so any
    fixed-length frequency works - not only those spelled with an upper
    case ``'H'``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame with a regular datetime index and a ``'close'`` column.
        Modified in place.
    periods : iterable of int
        Window sizes.

    Returns
    -------
    pandas.DataFrame
        `data_df` itself.

    Raises
    ------
    ValueError
        If the index frequency cannot be inferred or is not a fixed length
        of time (e.g. monthly).
    """
    close = data_df['close']
    for period in periods:
        data_df['ema' + str(period)] = close.ewm(span=period).mean()
        data_df['ma' + str(period)] = close.rolling(period).mean()

    freq_str = pd.infer_freq(data_df.index)
    if freq_str is None:
        raise ValueError("Could not infer a regular frequency from the index")

    # Length of one bar in hours; pd.Timedelta rejects non-fixed offsets.
    bar = pd.Timedelta(pd.tseries.frequencies.to_offset(freq_str))
    freq = bar.total_seconds() / 3600.0  # in hours

    data_df['200dma'] = close.rolling(int(200 * 24.0 / freq)).mean()
    data_df['bull'] = data_df['close'] > data_df['200dma']
    return data_df
def data_freq(time_series):
    """Determine frequency of given time series.

    The frequency string attached to the index is preferred; if the index
    carries no frequency (or an empty frequency string), it is inferred
    from the index values.

    Args:
        time_series (Series): Series with datetime index

    Returns:
        string: frequency specifier
    """
    attached = getattr(time_series.index, 'freq', None)
    label = getattr(attached, 'freqstr', None)
    if label:
        return label
    return pd.infer_freq(time_series.index)
def infer_or_inject_freq(df, injected_freq='1s', start_date=None, **kwargs):
    """Infer index frequency, or inject fake timestamps.

    When the inferred frequency is ``'N'`` the index is replaced by
    timestamps built as ``start + injected_freq * <old index>`` and the
    frequency is inferred again from the new index. In every other case
    the inferred frequency is simply attached to the existing index.

    `injected_freq`: spacing of the generated timestamps (default 1 second).
    `start_date`: the first date of the index (int or string); 0 is used
    when it is None. Extra `kwargs` are forwarded to `pd.to_datetime`.

    Returns `df`, modified in place.
    """
    detected = pd.infer_freq(df.index)
    if detected != 'N':
        df.index.freq = detected
        return df

    step = pd.to_timedelta(injected_freq)
    origin = pd.to_datetime(ifnone(start_date, 0), **kwargs)
    df.index = origin + step * df.index
    df.index.freq = pd.infer_freq(df.index)
    return df
def merge(self, ts: 'TimeSeries') -> 'TimeSeries':
    """Merge two time series and make sure all the given indexes are sorted.

    The underlying series of `self` and `ts` are concatenated (self first)
    and wrapped in a new TimeSeries carrying the metadata of `self`.

    Args:
        ts: the TimeSeries to merge with self

    Returns:
        TimeSeries
    """
    # Local import: `Series.append` was removed from pandas, and `pd` may
    # not be bound at module level here.
    import pandas as pd

    merged = pd.concat([self.series, ts.series])

    # NOTE(review): the result of this call is discarded; it only has an
    # effect if it raises. Confirm whether the frequency should be stored.
    infer_freq(merged.index)

    # instantiate a TimeSeries to sort it
    return TimeSeries(merged, self.metadata)
def DINGO_df_to_data_structure(file_in, var_list = None, fill_missing_with_nan = True, output_structure = None, return_global_attr = False):
    """Load a pickled DataFrame and return it as a frame or dict of arrays.

    Parameters
    ----------
    file_in : str
        Path of the pickle file, read with ``pd.read_pickle``.
    var_list : str or list, optional
        Column(s) to return; all columns when None.
    fill_missing_with_nan : bool
        Currently unused.
    output_structure : str, optional
        ``'pandas'`` returns ``df[var_list]``; anything else returns a dict
        with a ``'date_time'`` array of ``datetime`` objects plus one numpy
        array per variable.
    return_global_attr : bool
        If True, also return ``{'time_step': <int>}``.

    Returns
    -------
    The data structure, or ``(data structure, attribute dict)``.

    Raises
    ------
    ValueError
        If no regular frequency can be inferred from the index.
    """
    # NOTE(review): unpickling can execute arbitrary code - only use this
    # on trusted files.
    df = pd.read_pickle(file_in)

    time_step = pd.infer_freq(df.index)
    if time_step is None:
        raise ValueError('Could not infer a regular time step from the index')
    # The leading digits of the frequency string are the multiplier
    # ("30T" / "30min" -> 30). The unit suffix is not interpreted, and a
    # string without digits counts as a multiplier of 1.
    multiplier = time_step[:len(time_step) - len(time_step.lstrip('0123456789'))]
    attr_dict = {'time_step': int(multiplier) if multiplier else 1}

    if var_list is None:
        var_list = df.columns
    elif not isinstance(var_list, list):
        var_list = [var_list]

    if output_structure == 'pandas':
        data_structure = df[var_list]
    else:
        # `Timestamp.to_datetime()` no longer exists; to_pydatetime() is
        # the supported way to get a datetime object.
        data_dict = {'date_time': np.array([pd.Timestamp(rec).to_pydatetime()
                                            for rec in df.index])}
        for var in var_list:
            data_dict[var] = np.array(df[var])
        data_structure = data_dict

    if return_global_attr:
        return data_structure, attr_dict
    return data_structure
def get_freq(X):
    """Return the frequency of `X`'s index as an offset object.

    A MultiIndex is delegated to `get_freq_multi_idx`; any other index has
    its frequency inferred and converted with `to_offset`.
    """
    if not isinstance(X.index, pd.MultiIndex):
        return to_offset(pd.infer_freq(X.index))
    return get_freq_multi_idx(X)
def change_time_zone(ts, tz):
    """Convert hourly time series to new time zone. UTC is assumed if no time zone
    is assigned to the input time series.

    Note that the name of the index of `ts` is set to `tz` on the input
    object itself before the conversion.

    :param pandas.DataFrame/pandas.Series ts: time series.
    :param str tz: new time zone.
    :return: (*pandas.DataFrame/pandas.Series*) -- time series with new time zone.
    :raises TypeError: if tz is not a str.
    :raises ValueError: if tz is invalid or the inferred frequency of the time
        series is not hourly.
    """
    _check_time_series(ts, "time series")

    # pandas spells the hourly alias "H" or "h" depending on its version;
    # an un-inferable frequency (None) is rejected as well.
    if str(pd.infer_freq(ts.index)).upper() != "H":
        raise ValueError("frequency of time series must be 1h")

    if not isinstance(tz, str):
        raise TypeError("time zone must be a str")
    try:
        pytz.timezone(tz)
    except pytz.exceptions.UnknownTimeZoneError:
        raise ValueError("Unknown time zone %s" % tz)

    ts.index.name = tz
    if ts.index.tz is None:
        # Naive timestamps are interpreted as UTC.
        return ts.tz_localize("UTC").tz_convert(tz)
    return ts.tz_convert(tz)
def parse_data(df):
    """Validate a single-column time series and normalise its layout.

    A Series is converted to a one-column DataFrame. The column is renamed
    to ``"Target"``, the index is renamed to ``"Date"`` and passed through
    `add_freq` to attach a frequency. A DataFrame passed in is modified in
    place (its columns are renamed).

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data with a date-like index and exactly one column.

    Returns
    -------
    tuple
        ``(df, freq)`` with ``freq`` as returned by ``pd.infer_freq``.

    Raises
    ------
    TypeError
        If `df` is neither a DataFrame nor a Series, or its index has no
        ``date`` attribute.
    ValueError
        If the frame does not have exactly one column or contains nulls.
    """
    if isinstance(df, pd.DataFrame):
        # Also rejects frames without any column, which used to fail with
        # an IndexError further down.
        if df.shape[1] != 1:
            raise ValueError(
                "The dataframe should only contain one target column")
    elif isinstance(df, pd.Series):
        df = df.to_frame()
    else:
        raise TypeError(
            "Please supply a pandas dataframe with one column or a pandas series"
        )

    # Only date-like indexes expose `.date`.
    try:
        df.index.date
    except AttributeError:
        raise TypeError("The index should be a datetype")

    print(type(df))

    if df.isnull().any().values[0]:
        raise ValueError(
            "The dataframe cannot have any null values, please interpolate")

    try:
        df.columns = ["Target"]
    except ValueError:
        raise ValueError("There should only be one column")

    df.index = df.index.rename("Date")
    df.index = add_freq(df.index)
    print(
        "The data has been successfully parsed by infering a frequency, and establishing a 'Date' index and 'Target' column."
    )
    return df, pd.infer_freq(df.index)
def _round_date_nearest_index(time_index_series, time_stamp): """ Internal helper function to round the date to the nearest DateTimeIndex in the DataFrame. Note: Prioritises the index in the same month first. Eg. if 1 Jan is given, and indexes 31 Dec and 31 Jan are available, it will wound to 31 Jan. If no valid date is found, it will just round it to the oldest or newest date, whichever is closer. """ for time in time_index_series: # If the timestamp and a time in index are equal, just return if time == time_stamp: return time # If the frequency of the series is yearly, just check same year if "as" in pd.infer_freq( time_index_series).lower() and time.year == time_stamp.year: print "Time given was rounded to the same year as an index." return time # Else assume monthly, check same month, same year elif time.month == time_stamp.month and time.year == time_stamp.year: print "Time given was rounded to the same month as an index." return time # If above all fail, just try to round it to the first or last value in the entire series if abs((time_stamp - time_index_series[0]).total_seconds()) > abs( (time_stamp - time_index_series[-1]).total_seconds()): print "Out of range so rounding to in range index" return time_index_series[-1] # This code should never be reached under normal circumstances, placed as a failsafe. print "Did not round" return time_index_series[0]
def join_pathname(df, inplace=False, a=None, c=None, e=None, f=None):
    """
    Summary
    -------
    Function to join pathname parts of CalSim tidy DataFrame into a
    "Pathname" column.

    Parts A, C, E and F are taken from the matching columns when present,
    otherwise from the `a`, `c`, `e`, `f` arguments. When `e` is not given
    it is looked up from the frequency inferred from the unique "DateTime"
    values. The part columns are dropped once "Pathname" has been built.

    Raises ValueError when a required part is neither a column nor given.
    Returns `df` itself when `inplace` is true, otherwise a modified copy.
    """
    frame = df if inplace else df.copy()

    # Infer Part E, if not provided.
    if not e:
        step = pd.infer_freq(frame['DateTime'].unique())
        e = variables.t_steps_inv[step] if step else None

    # Fill required part columns from the arguments; collect the gaps.
    fallbacks = {'Part A': a, 'Part C': c, 'Part E': e, 'Part F': f}
    missing = []
    for column, value in fallbacks.items():
        if column in frame.columns:
            continue
        if value:
            frame[column] = value
        else:
            missing.append(column)
    if missing:
        raise ValueError(
            'Values required for the following columns: {}.'.format(missing)
        )

    parts = ['Part A', 'Part B', 'Part C', 'Part E', 'Part F']

    def _format_row(row):
        # The empty slot between C and E is deliberate (Part D is omitted).
        return r'/{}/{}/{}//{}/{}/'.format(*row.values)

    frame['Pathname'] = frame[parts].apply(_format_row, axis=1)
    frame.drop(parts, axis=1, inplace=True)
    return frame
def get_ytw_from_date(fromdate, srcfile=r'src/YTW-All-Values.xlsx'):
    '''
    load data from source file into dataframe

    Delegates to ``creditspread().get_ytw_from_date_delta(srcfile=srcfile)``
    from ``nb_credit_spread``.

    columns (as described by the original author):
        Corp - corporate bond rate
        TB - treasury bond rate
        CS - credit spread
        Econ - economic data

    NOTE(review): `fromdate` is accepted for interface compatibility but is
    not forwarded. The in-function implementation that read the Excel sheet
    with pandas and sliced it from `fromdate` sat after the ``return`` and
    could never run; it has been removed here. Restore it from version
    control if the date filter is still wanted.
    '''
    import nb_credit_spread as cslibrary

    cslib = cslibrary.creditspread()
    return cslib.get_ytw_from_date_delta(srcfile=srcfile)
def restore_index(df, idx_meta, rowid_sort=True):
    """
    restore index proper

    :param df: the dataframe
    :param idx_meta: index metadata
    :param rowid_sort: whether to sort by row id. defaults to True
         If your query is already sorted in some specific way,
         specify False to keep the sort order.
    :return: the dataframe with its index columns set as index; `df`
         itself when there are no index columns. Note that `df` is sorted
         in place and loses its ``_om#rowid`` column when `rowid_sort`
         applies.
    """
    # -- establish row order proper
    if rowid_sort and '_om#rowid' in df:
        df.sort_values('_om#rowid', inplace=True)
        del df['_om#rowid']

    # -- get index columns and set them as index
    index_cols = restore_index_columns_order(df.columns)
    if index_cols:
        result = df.set_index(index_cols)
        result.index.names = idx_meta.get('names', [None] * len(index_cols))
    else:
        result = df

    # -- restore datetime frequency, if possible
    if isinstance(result.index, pd.DatetimeIndex) and 'freq' in idx_meta:
        try:
            freq = idx_meta.get('freq') or pd.infer_freq(result.index)
            result = result.asfreq(freq)
        except Exception:
            # Best effort only: keep the index without a frequency. Unlike
            # a bare `except`, this lets KeyboardInterrupt and SystemExit
            # propagate.
            pass
    return result
def _get_feats(index, ts, freq, scale=True, features=[
        acf_features, arch_stat, crossing_points, entropy, flat_spots,
        heterogeneity, holt_parameters, lumpiness, nonlinearity,
        pacf_features, stl_features, stability, hw_parameters,
        unitroot_kpss, unitroot_pp, series_length, hurst
], dict_freqs=FREQS):
    """Compute one row of features for a single time series.

    When `freq` is None it is inferred from ``ts['ds']`` and translated
    through `dict_freqs`. A DataFrame input must have a ``'y'`` column,
    whose values are used; a Series contributes its values. The values are
    optionally passed through `scalets`, every function in `features` is
    called as ``func(ts, freq)`` and the resulting dicts are merged.

    Returns a one-row DataFrame indexed by `index`.
    """
    if freq is None:
        inferred = pd.infer_freq(ts['ds'])
        freq = dict_freqs[inferred]

    if isinstance(ts, pd.DataFrame):
        assert 'y' in ts.columns
        ts = ts['y'].values

    if isinstance(ts, pd.Series):
        ts = ts.values

    if scale:
        ts = scalets(ts)

    # ChainMap is kept so that, for duplicate keys, earlier feature
    # functions take precedence exactly as before.
    feature_dicts = [func(ts, freq) for func in features]
    merged = ChainMap(*feature_dicts)
    return pd.DataFrame(dict(merged), index=[index])
def test_seasonality_transformer(
    X_start, X_len, weekdays, weeks, months, quarter, year
):
    """Check the columns SeasonalityTransformer produces for daily data.

    All expected column groups must be present, and the columns equal to 1
    in the first row must be exactly the columns produced when only that
    first row is transformed.
    """
    date_index = pd.date_range(X_start, periods=X_len, freq="D")
    X = pd.DataFrame(index=date_index)
    y = pd.Series(np.arange(len(X)), name="values", index=X.index)
    freq = pd.infer_freq(X.index)

    seasonal = SeasonalityTransformer(freq=freq).fit(X, y).transform(X)
    df = pd.concat([X, y, seasonal], axis=1)

    for expected_columns in (weekdays, weeks, months, quarter, year):
        assert set(expected_columns).issubset(df.columns)

    # Columns flagged with 1 for the very first date ...
    first_row = df.head(1).T
    first_label = first_row.columns[0]
    cols_with_ones = first_row[first_row[first_label] == 1].index

    # ... must match what a single-date fit/transform generates.
    single_date = (
        SeasonalityTransformer(freq=freq)
        .fit(X.head(1), y.head(1))
        .transform(X.head(1))
    )
    assert set(cols_with_ones) == set(single_date.columns)
def tsreg(ts, freq=None, interp=False):
    """
    Function to regularize a time series object (pandas).
    The first three indices must be regular for freq=None!!!

    Parameters
    ----------
    ts : DataFrame
        pandas time series dataframe.
    freq : str or None
        Either specify the known frequency of the data or use None and
        determine the frequency from the first three indices.
    interp : bool
        Should linear interpolation be applied on all missing data?

    Returns
    -------
    DataFrame
        `ts` resampled (mean) to the regular frequency.

    Raises
    ------
    ValueError
        If `freq` is None and no frequency can be inferred from the first
        three indices.
    """
    if freq is None:
        freq = pd.infer_freq(ts.index[:3])
        if freq is None:
            # Fail with a clear message instead of handing None to resample.
            raise ValueError(
                'Could not infer a frequency from the first three indices; '
                'pass `freq` explicitly.')

    ts1 = ts.resample(freq).mean()
    if interp:
        ts1 = ts1.interpolate('time')
    return ts1
def tidy_to_wide(df):
    """
    Summary
    -------
    Transforms a copy of the input DataFrame from tidy to wide data format.

    The pathname is split into its parts, the parts (plus 'Units',
    'Data Type' and, when present, 'Study') become column levels, and the
    'Value' column is unstacked over them so that each row is one
    'DateTime'. The inferred frequency is attached to the resulting index.

    Raises TypeError if the input is not in tidy format.
    """
    # Ensure input DataFrame is in tidy format.
    if not validation.is_tidy(df):
        msg = 'Cannot transform DataFrame from tidy format to wide format.'
        raise TypeError(msg)

    # Initialize DataFrame for operation.
    df_out = df.copy()

    # Split Pathname into Parts.
    split_pathname(df_out, inplace=True)

    # Pivot DataFrame.
    col_header = ['Part A', 'Part B', 'Part C', 'Part E', 'Part F',
                  'Units', 'Data Type']
    if 'Study' in df_out.columns:
        col_header.insert(0, 'Study')
    df_out.set_index(col_header + ['DateTime'], append=True, inplace=True)
    # Drop the original (outermost) index level.
    df_out.reset_index(0, drop=True, inplace=True)
    df_out = df_out['Value']
    df_out = df_out.unstack(col_header)

    # The `warn` keyword of infer_freq has been removed from pandas; any
    # warning is silenced explicitly to keep the previous quiet behaviour.
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        inferred = pd.infer_freq(df_out.index)
    df_out.index.freq = inferred

    # Return DataFrame.
    return df_out
def periodicity(freq_or_frame):
    """
    resolve the number of periods per year

    Accepts an offset-like object (anything with a ``rule_code``), a
    frequency string, or a Series / DataFrame whose index frequency is
    used (attached, inferred, or guessed via `guess_freq` - in that order).

    :raises Exception: if the frequency has no entry in PER_YEAR_MAP.
    :raises ValueError: for any other kind of argument.
    """
    # `basestring` only exists on Python 2; fall back to `str` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if hasattr(freq_or_frame, 'rule_code'):
        # Strip an anchor such as "-DEC" before the lookup.
        rc = freq_or_frame.rule_code.split('-')[0]
        factor = PER_YEAR_MAP.get(rc, None)
        if factor is None:
            raise Exception('Failed to determine periodicity. No factor mapping for %s' % freq_or_frame)
        # An offset of n periods occurs 1/|n| as often per year.
        return factor / abs(freq_or_frame.n)

    if isinstance(freq_or_frame, string_types):
        factor = PER_YEAR_MAP.get(freq_or_frame, None)
        if factor is None:
            raise Exception('Failed to determine periodicity. No factor mapping for %s' % freq_or_frame)
        return factor

    # `pd.TimeSeries` was just an alias of `pd.Series` and no longer exists.
    if isinstance(freq_or_frame, (pd.Series, pd.DataFrame)):
        freq = freq_or_frame.index.freq
        if freq:
            return periodicity(freq)

        freq = pd.infer_freq(freq_or_frame.index)
        if freq:
            return periodicity(freq)

        # Attempt to resolve it
        import warnings
        freq = guess_freq(freq_or_frame.index)
        warnings.warn('frequency not set. guessed it to be %s' % freq)
        return periodicity(freq)

    raise ValueError("periodicity expects DataFrame, Series, or rule_code property")
def scale_profile(profile, weight):
    """Scale hourly profile using a list of monthly weights.

    Each weight is divided by the corresponding monthly sum of the
    profile; the resulting monthly factor is applied to every hour of that
    month. Twelve factors are laid out month by month starting at the first
    timestamp of the profile.

    :param pandas.Series profile: hourly profile.
    :param list weight: list of monthly weights.
    :return: (*pandas.Series*) -- scaled hourly profile.
    :raises TypeError: if profile is not a time series or weight is not a list.
    :raises ValueError: if frequency of time series is not 1h or size of weight is
        not 12
    """
    if not isinstance(profile, pd.Series):
        raise TypeError("profile must be a pandas.Series object")
    if not isinstance(weight, list):
        raise TypeError("weight must be a list")
    # pandas spells the hourly alias "H" or "h" depending on its version;
    # an un-inferable frequency (None) is rejected as well.
    if str(pd.infer_freq(profile.index)).upper() != "H":
        raise ValueError("frequency of time series must be 1h")
    if len(weight) != 12:
        raise ValueError("the list of weight must have exactly 12 elements")

    # Offset objects are used instead of the "M" / "H" aliases, whose
    # spelling differs between pandas versions.
    month_end = pd.offsets.MonthEnd()
    hour = pd.offsets.Hour()

    # min_count makes months with fewer than 28 days of data sum to NaN.
    monthly_profile = profile.resample(month_end).sum(min_count=24 * 28)
    monthly_factor = [t / p for t, p in zip(weight, monthly_profile.values)]

    month_starts = pd.date_range(profile.index.min(), periods=12, freq="MS")
    hourly_factor = (
        pd.Series(monthly_factor, index=month_starts)
        .resample(hour)
        .ffill()
        # Extends the last month's factor to the end of the profile.
        .reindex(profile.index, method="ffill")
    )
    return profile * hourly_factor
def validate_continuous_fasts(fasts: pd.Series) -> bool:
    """
    Validate a continuous log of fasts for use by other module functions.

    Validations:
    - Frequency of series index is 1 minute ('T', spelled 'min' by newer
      pandas versions)
    - Value at each time step is either 0 or 1 (0 ~ not fasting, 1 ~ fasting),
      no extraneous or NaN values

    Args:
        fasts: Series of continuous logs with a datetime index at a 1 minute
            frequency and values of 0 or 1.

    Returns:
        True if the fasts series is valid.

    Raises:
        ValueError: if the frequency is not 1 minute or a value is not 0 or 1.
    """
    # Validate frequency of index is 1 minute; accept both spellings pandas
    # uses for the minute alias.
    freq = pd.infer_freq(fasts.index)
    if freq not in ('T', 'min'):
        raise ValueError(f"""
        Frequency of the continuous fast must be: 'T' (1 minute).
        Frequency of fasts series input: {freq}.
        """)

    # Validate values only contain 0 or 1
    if not fasts.isin([0, 1]).all():
        unexpected_values = fasts[((fasts != 0) & (fasts != 1))]
        raise ValueError(f"""
        Continuous fast (input to fasts) must contain only values of 0 or 1.
        Check fasts for extraneous or NaN values: {unexpected_values}
        """)

    return True
def fit(self, X, y=None):
    """Verify that the index of `X` has daily frequency.

    The check lives in `fit` because pandas.infer_freq is used, which
    requires at least 3 observations.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features.
    y : Any
        Ignored

    Returns
    -------
    HolidayTransformer
        self

    Raises
    ------
    ValueError
        in case daily frequency is not used or very few datapoints are
        provided in X
    """
    if pd.infer_freq(X.index) == "D":
        return self

    index_type = type(X.index)
    found_freq = pd.infer_freq(X.index)
    raise ValueError(
        f"HolidayTransformer can be used only with daily frequency in index. "
        f"Your index is of type {index_type} with frequency {found_freq}"
    )
def _reindex(self, data, times, columns): if len(data) != len(times): if self.resample: # Resample at a specific frequency kwargs = {"periods": len(data)} if self.resample_rate is None: kwargs["freq"] = pd.infer_freq(times) kwargs["freq"] = pd.tseries.frequencies.to_offset(kwargs["freq"]) else: kwargs["freq"] = pd.DateOffset(seconds=1 / self.resample_rate) if self.resample_direction == "right": kwargs["start"] = times[0] elif self.resample_direction == "left": kwargs["end"] = times[-1] else: def middle(a): return int(np.ceil(len(a) / 2)) - 1 kwargs["start"] = times[middle(times)] - ( middle(data) * kwargs["freq"] ) times = pd.date_range(**kwargs) else: # Linearly arange between first and last times = pd.date_range(start=times[0], end=times[-1], periods=len(data)) return pd.DataFrame(data, times, columns)
def infer_periodocity(train):
    """Map the inferred index frequency of `train` to a seasonal period.

    The frequency is inferred with ``pd.infer_freq``; an anchor suffix
    (``"-SUN"``, ``"-DEC"``, ...) is ignored. Both the older and the newer
    pandas alias spellings are recognised.

    ======================= ========
    frequency               period
    ======================= ========
    monthly                 12
    hourly / business hour  24
    business day            5
    daily                   7
    weekly                  52
    quarterly               4
    annual                  10
    minutely, secondly      60
    milli/micro/nanosecond  1000
    ======================= ========

    Parameters
    ----------
    train : pandas.Series or pandas.DataFrame
        Data with a regular datetime index.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If the frequency cannot be inferred or is not in the table
        (previously an ``UnboundLocalError``).
    """
    # Keys are case sensitive: "MS" is month start, "ms" is millisecond.
    periods_by_alias = {}
    for aliases, period in (
        (("MS", "M", "ME", "BM", "BME", "BMS"), 12),
        (("BH", "H", "bh", "h"), 24),
        (("B",), 5),
        (("D",), 7),
        (("W",), 52),
        (("Q", "QS", "BQ", "BQS", "QE", "BQE"), 4),
        (("A", "BA", "AS", "BAS", "Y", "YE", "YS", "BY", "BYE", "BYS"), 10),
        (("T", "min"), 60),
        (("S", "s"), 60),
        (("L", "ms"), 1000),
        (("U", "us"), 1000),
        (("N", "ns"), 1000),
    ):
        for alias in aliases:
            periods_by_alias[alias] = period

    perd = pd.infer_freq(train.index)
    # "W-SUN" -> "W", "Q-DEC" -> "Q", ...
    base = perd.split("-")[0] if perd else perd
    if base not in periods_by_alias:
        raise ValueError(
            "No periodicity known for inferred frequency {!r}".format(perd))
    return periods_by_alias[base]
def read_knmi(fname, variables='RD'):
    """This method can be used to import KNMI data.

    Parameters
    ----------
    fname: str
        Filename and path to the file read by ``KnmiStation.fromfile``.
    variables: str
        String with the variable name to extract. A list of names is
        accepted too; None selects every variable in the file.

    Returns
    -------
    ts: pastas.TimeSeries
        returns a Pastas TimeSeries object or a list of objects (one per
        station and variable).

    Raises
    ------
    ValueError
        If a requested variable is not in the dataset.
    """
    knmi = KnmiStation.fromfile(fname)

    if variables is None:
        variables = knmi.variables.keys()
    if isinstance(variables, str):
        variables = [variables]

    # Variables that map to a known stress kind; everything else is None.
    kinds = {'RD': 'prec', 'RH': 'prec', 'EV24': 'evap'}

    collected = []
    for code in knmi.data['STN'].unique():
        for variable in variables:
            if variable not in knmi.data.keys():
                raise (ValueError(
                    "variable %s is not in this dataset. Please use one of "
                    "the following keys: %s" % (variable, knmi.data.keys())))

            series = knmi.data.loc[knmi.data['STN'] == code, variable]

            # get rid of the hours when data is daily
            if pd.infer_freq(series.index) == 'D':
                series.index = series.index.normalize()

            metadata = {}
            has_station_table = (knmi.stations is not None
                                 and not knmi.stations.empty)
            if has_station_table:
                station = knmi.stations.loc[code, :]
                metadata['x'] = station.LON_east
                metadata['y'] = station.LAT_north
                metadata['z'] = station.ALT_m
                metadata['projection'] = 'epsg:4326'
                stationname = station.NAME
            else:
                stationname = str(code)
            metadata['description'] = knmi.variables[variable]

            collected.append(TimeSeries(series,
                                        name=variable + ' ' + stationname,
                                        metadata=metadata,
                                        settings=kinds.get(variable)))

    # A single result is returned bare rather than as a one-element list.
    if len(collected) == 1:
        return collected[0]
    return collected
def xlsx_to_pandas(file_in,header=True,header_row=0,skiprows_after_header=0,date_col=True,regularise=True,worksheets=[]):
    """Read worksheets of an Excel workbook into a dict of DataFrames.

    NOTE(review): this is Python 2 code (print statements, xrange). It also
    uses `np.float`, which recent NumPy releases no longer provide.
    NOTE(review): the source reached review with its line structure
    collapsed; the nesting below - in particular which `if` the final
    `else` belongs to and the level of `d[sheet_name]=df` - is a
    reconstruction and must be confirmed against the original file.

    Parameters
    ----------
    file_in : workbook path handed to xlrd.open_workbook.
    header, date_col : a sheet is only converted when both are True.
    header_row : row index holding the column names.
    skiprows_after_header : number of rows skipped below the header.
    regularise : if True, reindex each frame onto a regular date range with
        the frequency inferred from its index.
    worksheets : sheet names to read; all sheets when empty. The list is
        only read, never modified.

    Returns
    -------
    dict mapping sheet name to DataFrame.
    """
    xl_book=xlrd.open_workbook(file_in)
    d={}
    # start_date / end_date are assigned but not used below.
    start_date='1900-01-01'
    end_date='2100-01-01'
    if not worksheets:
        get_sheets=xl_book.sheet_names()
    else:
        get_sheets=worksheets
    for sheet_name in get_sheets:
        sheet=xl_book.sheet_by_name(sheet_name)
        rows=sheet.nrows
        cols=sheet.ncols
        if rows==0: print 'Could not find any valid rows'
        if cols==0: print 'Could not find any valid columns'
        if rows!=0 and cols!=0:
            if header==True:
                if date_col==True:
                    column_names=[str(sheet.cell_value(header_row,i)) for i in range(cols)]
                    # First column holds Excel date serials; convert each
                    # data row to a datetime.
                    index=[]
                    for i in xrange(header_row+skiprows_after_header+1,sheet.nrows):
                        try:
                            index.append(dt.datetime(*xlrd.xldate_as_tuple(sheet.cell_value(i,0), xl_book.datemode)))
                        except ValueError:
                            # Unparseable stamps become '' index entries.
                            index.append('')
                            print 'Error in sheet '+sheet_name+' at row '+str(i)+'; missing or invalid datetime stamp! Skipping...'
                    df=pd.DataFrame(columns=column_names[1:],index=index)
                    for i in range(1,cols):
                        arr=np.array(sheet.col_values(i)[header_row+skiprows_after_header+1:])
                        # Empty cells are replaced by the sentinel -9999
                        # before the float conversion.
                        arr[arr=='']='-9999'
                        df[column_names[i]]=arr.astype(np.float)
                    if regularise==True:
                        # Reindex onto a gap-free range at the inferred
                        # frequency.
                        df_freq=pd.infer_freq(df.index)
                        df_ind=pd.date_range(start=df.index[0],end=df.index[-1],freq=df_freq)
                        df=df.reindex(df_ind)
                    d[sheet_name]=df
        else:
            # Sheets without rows or columns yield an empty frame.
            d[sheet_name]=pd.DataFrame()
    return d
    # Multiple sheets are returned as dictionary (pandas dataframe) objects
    # Note XLRD cell type codes:
    # XL_CELL_EMPTY: 0
    # XL_CELL_TEXT: 1 (STRING)
    # XL_CELL_NUMBER: 2 (FLOAT)
    # XL_CELL_DATE: 3 (FLOAT)
    # XL_CELL_BOOLEAN: 4 (INT)
    # XL_CELL_ERROR: 5 (INTERNAL EXCEL CODE)
    # XL_CELL_BLANK: 6 (EMPTY STRING)
def _QC(self):
    """Check the record interval of ``self.df`` and store records per day.

    The interval is taken from the digits of the frequency string inferred
    from the index of ``self.df`` and must be a multiple of 30; the number
    of records per day (1440 divided by the interval) is stored on
    ``self.recs_per_day``.

    Raises
    ------
    AssertionError
        If the interval is not a multiple of 30.
    ValueError
        If the inferred frequency string contains no digits.
    """
    freq_str = pd.infer_freq(self.df.index)
    # ``filter`` returns an iterator on Python 3, so ``int(filter(...))``
    # fails there; joining the digit characters works on Python 2 and 3.
    interval = int(''.join(char for char in freq_str if char.isdigit()))
    assert interval % 30 == 0
    self.recs_per_day = 1440 / interval
    return
def shift_dates(self,h):
    """ Auxiliary function for creating dates for forecasts

    Parameters
    ----------
    h : int
        How many steps to forecast

    Returns
    ----------
    A transformed date_index object

    Notes
    ----------
    The index is first trimmed by ``self.max_lag`` leading entries.

    - DatetimeIndex: the whole index is shifted ``h`` times by a
      ``pd.DateOffset`` built from the gap between its last two entries.
      NOTE(review): the gap is passed as the bare count ``n`` of
      ``pd.DateOffset`` (seconds for 'H'/'M'/'S' frequencies, days
      otherwise), and the index is shifted rather than extended - confirm
      this is what callers expect.
    - Integer index: ``h`` new values are appended, continuing the last step.
    - Non-pandas data: ``h`` values are appended, each one more than the last.
    """
    date_index = copy.deepcopy(self.index)
    date_index = date_index[self.max_lag:len(date_index)]

    if self.is_pandas is True:

        # ``pd.DatetimeIndex`` is the public name of the class formerly
        # reached through ``pd.tseries.index``.
        if isinstance(date_index, pd.DatetimeIndex):

            # Infer once instead of once per comparison.
            inferred = pd.infer_freq(date_index)

            if inferred in ('H', 'M', 'S'):
                for t in range(h):
                    date_index += pd.DateOffset((date_index[len(date_index)-1] - date_index[len(date_index)-2]).seconds)

            else: # Assume higher frequency (configured for days)
                for t in range(h):
                    date_index += pd.DateOffset((date_index[len(date_index)-1] - date_index[len(date_index)-2]).days)

        # An int64-dtype Index is what older pandas called Int64Index; the
        # dtype check also works where that class no longer exists.
        elif isinstance(date_index, pd.Index) and date_index.dtype == np.int64:

            for i in range(h):
                new_value = date_index.values[len(date_index.values)-1] + (date_index.values[len(date_index.values)-1] - date_index.values[len(date_index.values)-2])
                date_index = pd.Index(np.append(date_index.values,new_value))

    else:

        for t in range(h):
            date_index.append(date_index[len(date_index)-1]+1)

    return date_index
def _check_period_index(x, freq="M"):
    """Validate that ``x`` is indexed by dates or periods of a given frequency.

    Parameters
    ----------
    x : pandas object with an ``index`` attribute
        The data whose index is checked.
    freq : str
        Required prefix of the index frequency string (default "M").

    Raises
    ------
    ValueError
        If the index is neither a DatetimeIndex nor a PeriodIndex, or if
        its frequency (taken from ``index.freq`` when set, inferred
        otherwise) is unknown or does not start with ``freq``.
    """
    from pandas import PeriodIndex, DatetimeIndex
    if not isinstance(x.index, (DatetimeIndex, PeriodIndex)):
        raise ValueError("The index must be a DatetimeIndex or PeriodIndex")

    if x.index.freq is not None:
        inferred_freq = x.index.freqstr
    else:
        inferred_freq = pd.infer_freq(x.index)

    # ``infer_freq`` returns None for irregular indexes; report that as a
    # frequency mismatch instead of failing on ``None.startswith``.
    if inferred_freq is None or not inferred_freq.startswith(freq):
        # Expected value first, actual value second.
        raise ValueError("Expected frequency {}. Got {}".format(freq,
                                                                inferred_freq))
def add_freq(idx, freq=None): """Add a frequency attribute to idx, through inference or directly. Returns a copy. If `freq` is None, it is inferred. """ idx = idx.copy() if freq is None: if idx.freq is None: freq = pd.infer_freq(idx) else: return idx idx.freq = pd.tseries.frequencies.to_offset(freq) if idx.freq is None: raise AttributeError('no discernible frequency found to `idx`. Specify' ' a frequency string with `freq`.') return idx
def __init__(self, dataframe, resample = True, names_dict = None,
             insolation_threshold = 10, season_routine = 'standard'):
    """Store the configuration and the renamed input data.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; its index frequency must be inferable and the number
        it contains must be a multiple of 30.
    resample : bool
        Stored on ``self.resample``.
    names_dict : dict, optional
        External variable names; the defaults from
        ``self._define_default_external_names()`` are used when empty.
    insolation_threshold : number
        Stored on ``self.insolation_threshold``.
    season_routine : str
        Either 'standard' or 'barr'.

    Raises
    ------
    AssertionError
        If the interval is not a multiple of 30 or ``season_routine`` is
        not one of the accepted values.
    """
    freq_str = pd.infer_freq(dataframe.index)
    # ``filter`` returns an iterator on Python 3, so ``int(filter(...))``
    # fails there; joining the digit characters works on Python 2 and 3.
    interval = int(''.join(char for char in freq_str if char.isdigit()))
    assert interval % 30 == 0
    assert season_routine in ['standard', 'barr']

    if not names_dict:
        self.external_names = self._define_default_external_names()
    else:
        self.external_names = names_dict
    self.df = utils.rename_df(dataframe, self.external_names,
                              self._define_default_internal_names())

    self.resample = resample
    self.insolation_threshold = insolation_threshold
    self.season_routine = season_routine
    self.interval = interval
    # Window sizes depend on whether the interval is exactly 30.
    self.season_n = 1000 if interval == 30 else 600
    self.bin_n = 5 if interval == 30 else 3
    self.valid_years_list = self._get_valid_years()
def _get_stats_and_qc(self):
    """Run basic quality checks on ``self.df`` and store summary statistics.

    On success ``self.interval`` (digits of the inferred index frequency)
    and ``self.pct_available`` (percentage of non-null ``Observations``)
    are set.

    Raises
    ------
    RuntimeError
        If the interval is not a multiple of 30, if the ``Model`` column
        contains missing values, or if the percentage of available
        observations is below ``self.minimum_pct``.
    """
    freq_str = pd.infer_freq(self.df.index)
    # ``filter`` returns an iterator on Python 3, so ``int(filter(...))``
    # fails there; joining the digit characters works on Python 2 and 3.
    interval = int(''.join(char for char in freq_str if char.isdigit()))
    if not interval % 30 == 0:
        raise RuntimeError('Dataset datetime index is non-contiguous - '
                           'exiting')

    df_length = len(self.df)
    model_length = len(self.df.loc[pd.isnull(self.df.Model) == 0])
    obs_length = len(self.df.loc[pd.isnull(self.df.Observations) == 0])
    pct_available = obs_length / float(df_length) * 100

    if model_length != df_length:
        raise RuntimeError('{} missing values in model series... aborting'
                           .format(str(df_length - model_length)))
    if pct_available < self.minimum_pct:
        # Round the number itself; rounding its string form raises TypeError
        # and would mask this error.
        raise RuntimeError('Insufficient data to proceed (minimum % '
                           'set to {0}, encountered only {1}%)... '
                           'returning'
                           .format(str(self.minimum_pct),
                                   round(pct_available, 1)))

    self.interval = interval
    self.pct_available = pct_available
    return
def validate_series(self, series):
    """ This method performs some PASTAS specific tests for the TimeSeries.

    Parameters
    ----------
    series: pd.Series
        Pandas series object containing the series time series.

    Returns
    -------
    series: pandas.Series
        The validated series as pd.Series

    Notes
    -----
    The following steps are applied, in this order:

    1. Values are cast to float, the index is converted to Timestamps
       (making it a pandas DatetimeIndex) and sorted.
    2. Nan-values at the beginning and end are removed.
    3. The original frequency is determined, unless already set: it is
       inferred from the index or, failing that, taken from
       ``self.settings["freq"]``.
    4. Duplicate indices are removed (by averaging).
    5. Remaining nan-values are handled by ``self.fill_nan``.
    6. ``tmin`` / ``tmax`` in the settings are filled in when unset.

    """

    # 1. Make sure the indices are Timestamps and sorted
    series = series.astype(float)
    series.index = pd.to_datetime(series.index)
    series = series.sort_index()
    series.index.name = ""

    # 2. Drop nan-values at the beginning and end of the time series
    series = series.loc[series.first_valid_index():series.last_valid_index(
    )].copy(deep=True)

    # 3. Find the frequency of the original series
    if not self.freq_original:
        # Infer only once. ``infer_freq`` raises ValueError when there are
        # fewer than three timestamps; treat that as "cannot be inferred"
        # so the configured frequency is used instead of aborting.
        try:
            inferred_freq = pd.infer_freq(series.index)
        except ValueError:
            inferred_freq = None

        if inferred_freq:
            self.freq_original = inferred_freq
            logger.info("Inferred frequency from time series %s: freq=%s " % (
                self.name, self.freq_original))
        else:
            self.freq_original = self.settings["freq"]
            if self.freq_original is None:
                logger.info(
                    "Cannot determine frequency of series %s" % self.name)
            elif self.settings["fill_nan"] and self.settings["fill_nan"] != \
                    "drop":
                logger.warning("User-provided frequency is applied when "
                               "validating the Time Series %s. Make sure the "
                               "provided frequency is close to the real "
                               "frequency of the original series." %
                               self.name)

    # 4. Handle duplicate indices
    if not series.index.is_unique:
        logger.warning("duplicate time-indexes were found in the Time "
                       "Series %s. Values were averaged." % self.name)
        grouped = series.groupby(level=0)
        series = grouped.mean()

    # 5. Handle remaining nan-values
    if series.hasnans:
        series = self.fill_nan(series)

    # 6. Default the time limits to the extent of the validated series
    if self.settings["tmin"] is None:
        self.settings["tmin"] = series.index.min()
    if self.settings["tmax"] is None:
        self.settings["tmax"] = series.index.max()

    return series
# expedience. import os import numpy as np import pandas as pd import matplotlib.pyplot as plt from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt data = [ 446.6565, 454.4733, 455.663, 423.6322, 456.2713, 440.5881, 425.3325, 485.1494, 506.0482, 526.792, 514.2689, 494.211 ] index = pd.DatetimeIndex(start='1996', end='2008', freq='A') oildata = pd.Series(data, index) oildata.index = pd.DatetimeIndex( oildata.index, freq=pd.infer_freq(oildata.index)) data = [ 17.5534, 21.86, 23.8866, 26.9293, 26.8885, 28.8314, 30.0751, 30.9535, 30.1857, 31.5797, 32.5776, 33.4774, 39.0216, 41.3864, 41.5966 ] index = pd.DatetimeIndex(start='1990', end='2005', freq='A') air = pd.Series(data, index) air.index = pd.DatetimeIndex(air.index, freq=pd.infer_freq(air.index)) data = [ 263.9177, 268.3072, 260.6626, 266.6394, 277.5158, 283.834, 290.309, 292.4742, 300.8307, 309.2867, 318.3311, 329.3724, 338.884, 339.2441, 328.6006, 314.2554, 314.4597, 321.4138, 329.7893, 346.3852, 352.2979, 348.3705, 417.5629, 417.1236, 417.7495, 412.2339, 411.9468, 394.6971, 401.4993, 408.2705, 414.2428
def _resample_date_range(self, date_range, freq):
    """Re-express ``date_range`` at another frequency.

    The result starts at the first stamp of ``date_range`` and covers up
    to, but not including, the last stamp plus one step of the original
    (inferred) frequency.

    Parameters
    ----------
    date_range : pandas.DatetimeIndex
        Regularly spaced stamps whose frequency can be inferred.
    freq : str or offset
        Frequency of the returned range.

    Returns
    -------
    pandas.DatetimeIndex
    """
    orig_freq_str = pd.infer_freq(date_range)
    orig_freq = pd.tseries.frequencies.to_offset(orig_freq_str)
    min_date = date_range[0]
    max_date = date_range[-1] + orig_freq

    resampled = pd.date_range(min_date, max_date, freq=freq)
    # Left-closed interval: drop the end point when the range lands on it.
    # Done by hand because the ``closed`` keyword of ``date_range`` is not
    # available in every pandas version.
    if len(resampled) > 0 and resampled[-1] == max_date:
        resampled = resampled[:-1]
    return resampled
def correctDrift(drifted, correct_drifted_vars=None, correct=None,
                 get_fit=True, write_fit=True,
                 fit_file='correctDrift_linfit.params',
                 apply_fit=True, show_plot=False, return_plot=False,
                 units=None, return_index=False):
    """
    Correct drifted variables with a linear fit against a correct dataset.

    Parameters
    -----------
    correct: pandas.DataFrame
        dataset with the correct averages
    drifted: pandas.DataFrame
        dataset with the averages that need to be corrected
    correct_drifted_vars: dict
        dictionary where every key is a var in the right dataset and
        its value is its correspondent in the drifted dataset
    get_fit: bool
        whether or not to fit a linear relation between both datasets.
        Generally slow. Should only be done once
    write_fit: bool
        if get_fit == True, whether or not to write the linear fit to a
        file (recommended)
    fit_file: string
        where to write the linear fit (if one is written) or from where to
        read the linear fit (if no fit is computed)
    apply_fit: bool
        whether or not to apply the linear fit and correct the data
    show_plot: bool
        whether or not to show drifted vs correct plot, to see if it's a
        good fit
    return_plot: bool
        if True (and show_plot is True) the figure of the first fitted
        pair is returned instead of the corrected data
    units: dict
        if given, it creates a {fit_file}.units file recording in which
        units data has to be in order to be correctly corrected
    return_index: bool
        whether to return the indexes of the used points for the
        calculation. Serves to check the regression

    Returns
    -------
    outdf: pandas.DataFrame
        drifted dataset corrected with right dataset. With
        ``return_index=True`` a tuple ``(outdf, idx)`` is returned, where
        ``idx`` is the finite-value mask of the last fitted pair, or None
        when no fit was computed.

    Raises
    ------
    NameError
        If ``correct_drifted_vars`` is not given and ``correct`` is
        missing or has more than one column.
    """
    import pandas as pd
    import numpy as np

    if correct_drifted_vars:
        rwvars = correct_drifted_vars
    else:
        # Guard against ``correct=None`` so the documented NameError is
        # raised rather than an AttributeError.
        if correct is not None and len(correct.columns)==1:
            rwvars = { cor : dft for cor, dft in zip(correct.columns, drifted.columns) }
        else:
            raise NameError('If correct is not provided or has more than one column, you should provide correct_drifted_vars.')

    cors=[]
    # Stays None when no fit is computed, so ``return_index`` cannot hit an
    # unbound name.
    idx = None

    #----------------
    # This option is activated if we provide a correct dataset from which to withdraw the correction parameters
    if get_fit:
        for slw, fst in rwvars.items():
            slow=correct[slw]
            fast=drifted[fst]
            #----------------
            # Check to see if the frequency in both datasets are the same. Otherwise we are comparing different things
            try:
                if pd.infer_freq(correct.index) == pd.infer_freq(drifted.index):
                    slow, fast = map(np.array, [slow, fast] )
                else:
                    print('Frequencies must be the same, however, inferred frequencies appear to be different. Plese check.')
            except TypeError:
                print('Cannot determine if frequencies are the same. We will continue but you should check')
                slow, fast = map(np.array, [slow, fast] )
            #----------------

            #----------------
            # Does the 1D fitting filtering for NaN values (very important apparently)
            idx = np.isfinite(slow) & np.isfinite(fast)
            coefs, residuals, rank, singular_vals, rcond = np.polyfit(fast[idx], slow[idx], 1, full=True)
            #----------------

            if show_plot:
                # Imported lazily so matplotlib is only needed for plotting.
                from matplotlib import pyplot as plt
                plt.title('{} vs {}'.format(fst, slw))
                plt.plot(fast[idx], slow[idx], marker='o', linestyle='')
                plt.plot(fast[idx], np.poly1d(coefs)(fast[idx]), '-', linewidth=2)
                plt.xlabel(fst)
                plt.ylabel(slw)
                plt.grid(True)
                fig = plt.gcf()
                plt.show()
                if return_plot:
                    return fig

            correc=pd.DataFrame(columns=[ '{}_{}'.format(slw, fst) ], index=['angular', 'linear'], data=coefs).transpose()
            cors.append(correc)
        cors = pd.concat(cors, join='outer')
        print(cors)
        #----------------

        #----------------
        # Writes the fit parameters in a file to be used later
        if write_fit:
            cors.index.name='correct_drifted'
            cors.to_csv(fit_file, index=True)
            if units:
                with open(fit_file+'.units', 'wt') as fou:
                    for key, item in units.items():
                        fou.write('{"%s":"%s"}\n' % (key,item))
        #----------------

    #----------------
    # If you do not want to correct from an existing correct dataset. A file with the parameters must be read
    else:
        cors=pd.read_csv(fit_file, index_col=0, header=0)
    #----------------

    #------------
    # Applies the fit column by column
    corrected=drifted.copy()
    if apply_fit:
        for slw, fst in rwvars.items():
            coefs = np.array(cors.loc['{}_{}'.format(slw,fst), ['angular','linear']])
            corrected[ fst ] = np.poly1d(coefs)(drifted[ fst ])
    #------------

    #----------------
    # The returning of the index idx is done mainly for checking purposes
    if return_index:
        return corrected, idx
    else:
        return corrected
stdata[i] = 0 plt.figure() plt.plot(stdata) stdiff = np.diff(stdata,n=1) plt.figure() plt.plot(stdiff) rng = pd.date_range(start = '2016-01-01', periods = data.size, freq ='H') D = pd.DataFrame(data).set_index(rng) dm = pd.DataFrame(D.resample('D').mean()) dm = dm.set_index(pd.date_range(start = '2016-01-01', periods = dm.shape[0], fraq = 'D')) dm.index = pd.DatetimeIndex(pd.date_range(start = '2016-01-01', periods = dm.shape[0], fraq = 'D')) pd.infer_freq(dm) dmdiff = np.diff(dm[dm.columns[0]]) dm.plot() plt.figure() plt.plot(dmdiff) np.mean(dmdiff) #dm.to_csv('daily_means_2016.csv', sep=',') #pd.DataFrame(dmdiff).to_csv('diff_daily_means_2016.csv', sep=',') #### decompose dm and dmdiff ####
def setup_class(cls):
    """Attach the reference time series used by the tests to ``cls``.

    Four series are built from literal values and string timestamps:
    ``oildata_oil``, ``air_ausair`` and ``livestock2_livestock`` (one value
    per 31 December) and ``aust`` (one value on the first day of March,
    June, September and December). Each gets a DatetimeIndex whose
    frequency is inferred from its timestamps.
    """
    # Series are built from plain lists rather than JSON for backwards
    # compatibility with pandas.

    def _dated_series(values, stamps):
        # Build the series on string labels first, then swap in a
        # DatetimeIndex carrying the inferred frequency.
        series = pd.Series(values, stamps)
        series.index = pd.DatetimeIndex(
            series.index, freq=pd.infer_freq(series.index))
        return series

    def _year_ends(first_year, last_year):
        return ['%d-12-31 00:00:00' % year
                for year in range(first_year, last_year + 1)]

    oil_values = [
        446.65652290000003, 454.47330649999998, 455.66297400000002,
        423.63223879999998, 456.27132790000002, 440.58805009999998,
        425.33252010000001, 485.14944789999998, 506.04816210000001,
        526.79198329999997, 514.26888899999994, 494.21101929999998,
    ]
    cls.oildata_oil = _dated_series(oil_values, _year_ends(1996, 2007))

    air_values = [
        17.5534, 21.860099999999999, 23.886600000000001,
        26.929300000000001, 26.888500000000001, 28.831399999999999,
        30.075099999999999, 30.953499999999998, 30.185700000000001,
        31.579699999999999, 32.577568999999997, 33.477398000000001,
        39.021580999999998, 41.386431999999999, 41.596552000000003,
    ]
    cls.air_ausair = _dated_series(air_values, _year_ends(1990, 2004))

    livestock_values = [
        263.91774700000002, 268.30722200000002, 260.662556,
        266.63941899999998, 277.51577800000001, 283.834045,
        290.30902800000001, 292.474198, 300.83069399999999,
        309.28665699999999, 318.33108099999998, 329.37239,
        338.88399800000002, 339.24412599999999, 328.60063200000002,
        314.25538499999999, 314.45969500000001, 321.41377899999998,
        329.78929199999999, 346.38516499999997, 352.29788200000002,
        348.37051500000001, 417.56292200000001, 417.12356999999997,
        417.749459, 412.233904, 411.94681700000001,
        394.69707499999998, 401.49927000000002, 408.27046799999999,
        414.24279999999999,
    ]
    cls.livestock2_livestock = _dated_series(livestock_values,
                                             _year_ends(1970, 2000))

    aust_values = [
        41.727457999999999, 24.04185, 32.328102999999999,
        37.328707999999999, 46.213152999999998, 29.346326000000001,
        36.482909999999997, 42.977719, 48.901524999999999,
        31.180221, 37.717880999999998, 40.420211000000002,
        51.206862999999998, 31.887228, 40.978262999999998,
        43.772491000000002, 55.558566999999996, 33.850915000000001,
        42.076383, 45.642291999999998, 59.766779999999997,
        35.191876999999998, 44.319737000000003, 47.913736,
    ]
    aust_stamps = ['%d-%02d-01 00:00:00' % (year, month)
                   for year in range(2005, 2011)
                   for month in (3, 6, 9, 12)]
    cls.aust = _dated_series(aust_values, aust_stamps)
all_utm_N[i] = utm_N all_utm_E[i] = utm_E # convert the numpy arrays to pandas dataframes df_Temp = pd.DataFrame(all_Temp) df_Prec = pd.DataFrame(all_Prec) df_Wind = pd.DataFrame(all_Wind) df_RH = pd.DataFrame(all_RH) df_SW = pd.DataFrame(all_SW) df_LW = pd.DataFrame(all_LW) #get the time variable from the original netCDF file df_Time = pd.DataFrame(dsTotal.time.values) dtIndex = pd.DatetimeIndex(dsTotal.time.values) freq = pd.infer_freq(dtIndex); #df_Time = df_Time.set_index(dtIndex) # this is slow #df_Time.apply(tz_update_utc) df_TimeTemp = pd.concat([df_Time, df_Temp], axis = 1) df_TimePrec = pd.concat([df_Time, df_Prec], axis = 1) df_TimeWind = pd.concat([df_Time, df_Wind], axis = 1) df_TimeRH = pd.concat([df_Time, df_RH], axis = 1) df_TimeSW = pd.concat([df_Time, df_SW], axis = 1) df_TimeLW = pd.concat([df_Time, df_LW], axis = 1) # add time variable to data frame # add names to the columns df_TimeTemp.columns = columnNames df_TimePrec.columns = columnNames
def asbestfreq(
        data,
        force_freq=None,
):
    """Test to determine best frequency to represent data.

    This uses several techniques.
    0.5.  If index is not DateTimeIndex, return
    1. If force_freq is set use .asfreq.
    2. If data.index.freq is not None, just return.
    3. If data.index.inferred_freq is set use .asfreq.
    4. Use pd.infer_freq - fails if any missing
    5. Use .is_* functions to establish A, AS, A-*, AS-*, Q, QS, M, MS
    6. Use minimum interval to establish the fixed time periods up to weekly
    7. Gives up returning None for PANDAS offset string

    Returns the data, converted with ``.asfreq`` whenever a frequency
    could be established.

    Raises ValueError if the index contains duplicate or decreasing
    timestamps.
    """
    if not isinstance(data.index, pd.DatetimeIndex):
        return data

    if force_freq is not None:
        return data.asfreq(force_freq)

    # Differences between consecutive timestamps, in integer nanoseconds.
    ndiff = np.diff(data.index.values.astype('int64'))
    if np.any(ndiff <= 0):
        raise ValueError("""
*
*   Duplicate or time reversal index entry at
*   record {1} (start count at 0):
*   "{0}".
*
""".format(data.index[:-1][ndiff <= 0][0],
           np.where(ndiff <= 0)[0][0] + 1))

    if data.index.freq is not None:
        return data

    # Since pandas doesn't set data.index.freq and data.index.freqstr when
    # using .asfreq, this function returns that PANDAS time offset alias code
    # also.  Not ideal at all.
    #
    # This gets most of the frequencies...
    if data.index.inferred_freq is not None:
        try:
            return data.asfreq(data.index.inferred_freq)
        except ValueError:
            pass

    # pd.infer_freq would fail if given a large dataset
    if len(data.index) > 100:
        slic = slice(None, 99)
    else:
        slic = slice(None, None)
    try:
        infer_freq = pd.infer_freq(data.index[slic])
    except ValueError:
        infer_freq = None
    if infer_freq is not None:
        return data.asfreq(infer_freq)

    # At this point pd.infer_freq failed probably because of missing values.
    # The following algorithm would not capture things like BQ, BQS
    # ...etc.
    if np.all(data.index.is_year_end):
        infer_freq = 'A'
    elif np.all(data.index.is_year_start):
        infer_freq = 'AS'
    elif np.all(data.index.is_quarter_end):
        infer_freq = 'Q'
    elif np.all(data.index.is_quarter_start):
        infer_freq = 'QS'
    elif np.all(data.index.is_month_end):
        if np.all(data.index.month == data.index[0].month):
            # Actually yearly with different ends
            infer_freq = 'A-{0}'.format(_ANNUALS[data.index[0].month])
        else:
            infer_freq = 'M'
    elif np.all(data.index.is_month_start):
        if np.all(data.index.month == data.index[0].month):
            # Actually yearly with different start: use the anchor of the
            # preceding month. The offset belongs on the month number, not
            # on the looked-up value.
            # NOTE(review): assumes _ANNUALS has an entry for key 0 (the
            # month before January) - confirm against its definition.
            infer_freq = 'A-{0}'.format(_ANNUALS[data.index[0].month - 1])
        else:
            infer_freq = 'MS'
    if infer_freq is not None:
        return data.asfreq(infer_freq)

    # Use the minimum of the intervals to test a new interval.
    # Should work for fixed intervals.
    ndiff = sorted(set(ndiff))
    mininterval = np.min(ndiff)
    if mininterval <= 0:
        raise ValueError
    if len(ndiff) == 1:
        ngcd = ndiff[0]
    else:
        ngcd = reduce(gcd, ndiff)

    # Map the common step (in nanoseconds) to the largest fitting unit.
    if ngcd < 1000:
        infer_freq = '{0}N'.format(ngcd)
    elif ngcd < 1000000:
        infer_freq = '{0}U'.format(ngcd // 1000)
    elif ngcd < 1000000000:
        infer_freq = '{0}L'.format(ngcd // 1000000)
    elif ngcd < 60000000000:
        infer_freq = '{0}S'.format(ngcd // 1000000000)
    elif ngcd < 3600000000000:
        infer_freq = '{0}T'.format(ngcd // 60000000000)
    elif ngcd < 86400000000000:
        infer_freq = '{0}H'.format(ngcd // 3600000000000)
    elif ngcd < 604800000000000:
        infer_freq = '{0}D'.format(ngcd // 86400000000000)
    elif ngcd < 2419200000000000:
        infer_freq = '{0}W'.format(ngcd // 604800000000000)
        if np.all(data.index.dayofweek == data.index[0].dayofweek):
            infer_freq = infer_freq + '-{0}'.format(
                _WEEKLIES[data.index[0].dayofweek])
        else:
            infer_freq = 'D'

    if infer_freq is not None:
        return data.asfreq(infer_freq)

    # Give up
    return data
def resample_to_model_data_index(df, date_index, frequencies, date_group, \
                                 start, end, \
                                 fill='mean', stat='50%', df_freq=None,
                                 index_report=True, label='left', debug=False,
                                 retain_na=False):
    """Resample ``df`` piecewise so that its index matches ``date_index``.

    Parameters
    ----------
    df : pandas.DataFrame or Series with a datetime index
    date_index : sequence of datetimes
        Target model time index.
    frequencies : list
        Resampling frequency for each period; must have exactly one item
        less than ``date_group``.
    date_group : list
        Period boundaries; period ``i`` runs from ``date_group[i]`` to
        ``date_group[i + 1]`` (the last period is open-ended).
    start, end
        Bounds used to slice ``df`` before processing.
    fill, stat
        Passed to ``_fill_in_time_series_nan`` to fill gaps.
    df_freq : str, optional
        If given, ``df`` is first resampled (mean) to this frequency;
        otherwise the frequency is inferred from the sliced index.
    index_report : bool
        If True, check the resampled index against ``date_index``; on a
        mismatch the problem is printed and the interpreter exits via
        ``sys.exit``.
    label : str
        Bin label passed to ``resample``.
    debug : bool
        If True, print the index of every resampled part.
    retain_na : bool
        If False and ``fill == 'none'``, rows with NaN are dropped.

    Returns
    -------
    The concatenated resampled data, or None when ``frequencies`` and
    ``date_group`` have inconsistent lengths.
    """
    pd_dt = pd.to_datetime

    if len(frequencies) != len(date_group) - 1:
        print("Frequencies list must have one less item than the date_group list")
        return

    if df_freq is not None:
        df = df.resample(df_freq).mean()
    # end if

    df = df.loc[start:end]

    # However if the time period for the model is longer we need to reindex
    # the dataseries
    if df_freq is None:
        df_freq = pd.infer_freq(df.index)
    # end if

    # Create temporary date_index
    date_index_temp = pd.date_range(start=date_index[0], end=date_index[-1], \
                                    freq=df_freq)
    df = df.reindex(date_index_temp)
    # Then we have to fill in the missing values with mean or some other descriptor
    df = _fill_in_time_series_nan(df, fill=fill, stat=stat)

    # Create empty list for placing the resampled parts of the dataframe
    df_resamples = []
    len_frequencies = len(frequencies)

    for index, frequency in enumerate(frequencies):
        p_start, p_end = date_group[index], date_group[index + 1]
        is_last = index >= len_frequencies - 1

        if not is_last:
            resample = df[(df.index >= pd_dt(p_start)) & (df.index < pd_dt(p_end))] \
                         .resample(frequency, label=label).mean()
        elif len_frequencies == 1:
            resample = df[(df.index >= pd_dt(p_start))] \
                         .resample(frequency, label='right').mean()
        else:
            resample = df[(df.index >= pd_dt(p_start))] \
                         .resample(frequency, label=label).mean()

        if debug:
            # Function-call form works on both Python 2 and Python 3.
            print(resample.index)

        # Trim the bin that overlaps the neighbouring period.
        if not is_last:
            if label == 'left':
                df_resamples += [resample.iloc[1:]]
            elif label == 'right':
                df_resamples += [resample.iloc[:-1]]
            # end if
        elif len_frequencies == 1:
            df_resamples += [resample]
        else:
            df_resamples += [resample.iloc[1:]]
        # end if
    # end for

    df_concat = pd.concat(df_resamples)

    if index_report:
        # TODO: Report if any of the df_concat indices are not in date_index
        # With several periods the final model date is excluded from the
        # comparison.
        if len_frequencies > 1:
            expected_index = date_index[:-1]
        else:
            expected_index = date_index
        # end if

        if np.all(np.isin(df_concat.index, expected_index)):
            print("Successful match of date indices for model and resampled df")
        else:
            print("*** Failed match of some date indices for model and resampled df \n {0} \n {1}".format(df_concat.index, date_index))
            import sys
            sys.exit("*** Failed match of some date indices for model and resampled df \n {0} \n {1}".format(df_concat.index, date_index))
        # end if
    # end if

    # Remove the dead rows from the dataframe if there was no filling
    if fill == 'none' and not retain_na:
        df_concat = df_concat.dropna()
    # end if

    return df_concat