def interpolate(x: pd.Series, dates: Union[List[date], List[time], pd.Series] = None,
                method: Interpolate = Interpolate.INTERSECT) -> pd.Series:
    """
    Interpolate over specified dates or times

    :param x: timeseries to interpolate
    :param dates: list of dates/times, or another series whose index supplies
        the requested dates/times; when omitted, the index of ``x`` is used
    :param method: interpolation method (default: intersect)
    :return: ``x`` re-indexed onto the requested dates/times
    :raises MqValueError: if ``method`` is not one of the handled methods

    Interpolate series based on the dates in a second series:

    >>> a = generate_series(100)
    >>> b = generate_series(100)
    >>> interpolate(a, b, Interpolate.INTERSECT)
    """
    if dates is None:
        dates = x

    # Only the index of align_series matters: it carries the requested dates.
    if isinstance(dates, pd.Series):
        align_series = dates
    else:
        align_series = pd.Series(np.nan, index=dates)

    if method == Interpolate.INTERSECT:
        # Only returns a value for valid dates
        return x.align(align_series, join='inner')[0]
    if method == Interpolate.NAN:
        # Value will be NaN for dates not present in the series
        return x.align(align_series, join='right')[0]
    if method == Interpolate.ZERO:
        # Value will be zero for dates not present in the series.
        # Build the zero series from the *index* of the requested dates; passing a
        # Series as the index would use its values as the index labels instead.
        zero_series = pd.Series(0.0, index=align_series.index)
        return x.align(zero_series, join='right', fill_value=0)[0]
    if method == Interpolate.STEP:
        # Value of the previous valid point
        return __interpolate_step(x, align_series)

    # str() is required: concatenating an enum member (or any non-str) to a str
    # raises TypeError and would hide the intended MqValueError.
    raise MqValueError('Unknown intersection type: ' + str(method))
def test_align_multiindex():
    """Aligning a MultiIndex series with a single-level series is symmetric."""
    # GH 10665
    level_names = ("a", "b", "c")
    full_index = pd.MultiIndex.from_product(
        [range(2), range(3), range(2)], names=level_names)
    trimmed_index = pd.MultiIndex.from_product(
        [range(2), range(2), range(2)], names=level_names)

    wide = Series(np.arange(12, dtype="int64"), index=full_index)
    narrow = Series(np.arange(2, dtype="int64"),
                    index=pd.Index(range(2), name="b"))

    # (join used from the wide side, mirrored join from the narrow side,
    #  expected wide result, expected narrow result)
    scenarios = [
        (
            "left",
            "right",
            wide,
            Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=full_index),
        ),
        (
            "right",
            "left",
            Series([0, 1, 2, 3, 6, 7, 8, 9], index=trimmed_index),
            Series([0, 0, 1, 1] * 2, index=trimmed_index),
        ),
    ]

    for wide_join, narrow_join, expected_wide, expected_narrow in scenarios:
        # these must be the same results (but flipped)
        from_wide = wide.align(narrow, join=wide_join)
        from_narrow = narrow.align(wide, join=narrow_join)

        tm.assert_series_equal(expected_wide, from_wide[0])
        tm.assert_series_equal(expected_wide, from_narrow[1])
        tm.assert_series_equal(expected_narrow, from_wide[1])
        tm.assert_series_equal(expected_narrow, from_narrow[0])
class Align(object):
    """Benchmarks for aligning two Series whose sorted integer indexes differ."""

    def setup(self):
        """Create two random Series indexed by sorted samples of one stamp pool."""
        n_obs = 5 * 10**5
        offsets = np.arange(0, 10**13, 10**7)
        pool = offsets + np.datetime64('now').view('i8')

        # Draw the two indexes first, then the values, one series after the other.
        first_index = np.sort(np.random.choice(pool, n_obs, replace=False))
        second_index = np.sort(np.random.choice(pool, n_obs, replace=False))

        self.ts1 = Series(np.random.randn(n_obs), index=first_index)
        self.ts2 = Series(np.random.randn(n_obs), index=second_index)

    def time_series_align_int64_index(self):
        """Time the implicit alignment performed by adding the two series."""
        self.ts1 + self.ts2

    def time_series_align_left_monotonic(self):
        """Time an explicit left-join alignment of the two series."""
        self.ts1.align(self.ts2, join='left')
def test_subtracting_two_series_with_unordered_index_and_all_nan_index( self, data_result, data_expected ): # GH 38439 a_index_result = MultiIndex.from_tuples(data_result[0]) b_index_result = MultiIndex.from_tuples(data_result[1]) a_series_result = Series(data_result[2], index=a_index_result) b_series_result = Series(data_result[3], index=b_index_result) result = a_series_result.align(b_series_result) a_index_expected = MultiIndex.from_tuples(data_expected[0]) b_index_expected = MultiIndex.from_tuples(data_expected[1]) a_series_expected = Series(data_expected[2], index=a_index_expected) b_series_expected = Series(data_expected[3], index=b_index_expected) a_series_expected.index = a_series_expected.index.set_levels( [ a_series_expected.index.levels[0].astype("float"), a_series_expected.index.levels[1].astype("float"), ] ) b_series_expected.index = b_series_expected.index.set_levels( [ b_series_expected.index.levels[0].astype("float"), b_series_expected.index.levels[1].astype("float"), ] ) tm.assert_series_equal(result[0], a_series_expected) tm.assert_series_equal(result[1], b_series_expected)
def test_align_with_dataframe_method(method): # GH31788 ser = Series(range(3), index=range(3)) df = pd.DataFrame(0.0, index=range(3), columns=range(3)) result_ser, result_df = ser.align(df, method=method) tm.assert_series_equal(result_ser, ser) tm.assert_frame_equal(result_df, df)
def test_align_series(self):
    """Arithmetic and ``align`` on period-indexed series align by label."""
    years = period_range('1/1/2000', '1/1/2010', freq='A')
    ts = Series(np.random.randn(len(years)), index=years)

    # Adding every other observation leaves NaN at the skipped positions.
    expected = ts + ts
    expected[1::2] = np.nan

    in_order = ts + ts[::2]
    tm.assert_series_equal(in_order, expected)

    shuffled = ts + _permute(ts[::2])
    tm.assert_series_equal(shuffled, expected)

    # it works!
    for how in ('inner', 'outer', 'left', 'right'):
        ts.align(ts[::2], join=how)

    msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)"
    with tm.assert_raises_regex(period.IncompatibleFrequency, msg):
        ts + ts.asfreq('D', how="end")
def test_series_align_aware(self):
    """Aligning series from two different timezones yields UTC indexes."""
    eastern_index = date_range('2001', periods=5, freq='H', tz='US/Eastern')
    eastern = Series(np.random.randn(len(eastern_index)), index=eastern_index)
    central = eastern.tz_convert('US/Central')

    # different timezones convert to UTC
    for aligned in eastern.align(central):
        assert aligned.index.tz == pytz.UTC
def test_series_align_aware(self):
    """Check that aligning tz-aware series in different zones produces UTC."""
    stamps = date_range("2001", periods=5, freq="H", tz="US/Eastern")
    values = np.random.randn(len(stamps))
    ser_eastern = Series(values, index=stamps)
    ser_central = ser_eastern.tz_convert("US/Central")

    # different timezones convert to UTC
    aligned_eastern, aligned_central = ser_eastern.align(ser_central)
    observed_zones = (aligned_eastern.index.tz, aligned_central.index.tz)
    for zone in observed_zones:
        assert zone == pytz.UTC
def test_align_left_fewer_levels():
    """Align a two-level series with a three-level series sharing two levels."""
    # GH#45224
    two_level = pd.MultiIndex.from_tuples([(1, 3)], names=["a", "c"])
    three_level = pd.MultiIndex.from_tuples([(1, 2, 3)], names=["a", "b", "c"])
    # Both sides are expected on the same index, with "b" appended last.
    joined = pd.MultiIndex.from_tuples([(1, 3, 2)], names=["a", "c", "b"])

    left = Series([2], index=two_level)
    right = Series([1], index=three_level)

    aligned = left.align(right)

    tm.assert_series_equal(aligned[0], Series([2], index=joined))
    tm.assert_series_equal(aligned[1], Series([1], index=joined))
def test_str_cat_align_indexed(index_or_series, join):
    """``str.cat`` with ``join`` must match concatenating pre-aligned inputs."""
    # https://github.com/pandas-dev/pandas/issues/18657
    box = index_or_series

    caller = Series(list("abcd"), index=list("abcd"))
    other = Series(list("DAEB"), index=list("daeb"))

    # result after manual alignment of inputs
    aligned_caller, aligned_other = caller.align(other, join=join)
    expected = aligned_caller.str.cat(aligned_other, na_rep="-")

    if box == Index:
        caller = Index(caller)
        aligned_caller = Index(aligned_caller)
        expected = Index(expected)

    result = caller.str.cat(other, join=join, na_rep="-")
    assert_series_or_index_equal(result, expected)
def align_method_SERIES(left: Series, right, align_asobject: bool = False):
    """
    Align ``left`` and ``right`` on their indexes when ``right`` is a Series.

    Both operands are returned untouched when ``right`` is not a Series or
    when the two indexes are already equal.  With ``align_asobject`` set, both
    operands are cast to object dtype before being aligned.
    """
    # ToDo: Different from align_method_FRAME, list, tuple and ndarray
    # are not coerced here
    # because Series has inconsistencies described in #13637
    if not isinstance(right, ABCSeries):
        return left, right

    # avoid repeated alignment
    if left.index.equals(right.index):
        return left, right

    if align_asobject:
        # to keep original value's dtype for bool ops
        left, right = left.astype(object), right.astype(object)

    aligned_left, aligned_right = left.align(right, copy=False)
    return aligned_left, aligned_right
def __interpolate_step(x: pd.Series, dates: pd.Series = None) -> pd.Series:
    """
    Step-interpolate ``x`` onto the index of ``dates``.

    :param x: series to interpolate; must not be empty
    :param dates: series whose index supplies the requested dates/times;
        when omitted, the index of ``x`` is used
    :return: ``x`` right-aligned onto ``dates.index`` with every missing value
        replaced by the closest preceding value
    :raises MqValueError: if ``x`` is empty

    Leading gaps are seeded from ``x``: with the value at or before the first
    requested date, or with the first value of ``x`` when the first requested
    date precedes the whole series.
    """
    if x.empty:
        raise MqValueError('Cannot perform step interpolation on an empty series')

    if dates is None:
        # The parameter defaults to None; fall back to the series' own index
        # instead of failing on ``None.index``.
        dates = x

    curve = x.align(dates, join='right')[0]  # only need values from dates
    if curve.empty:
        # Nothing was requested, so there is no first date to seed from.
        return curve

    first_date = pd.Timestamp(dates.index[0]) if isinstance(x.index[0], pd.Timestamp) else dates.index[0]

    # locate previous valid date or take first value from series
    if first_date < x.index[0]:
        seed = x.iloc[0]
    else:
        # get_indexer(..., method='pad') gives the position of the last label
        # <= first_date; it replaces Index.get_loc(key, 'pad'), whose ``method``
        # argument was removed in pandas 2.0.
        seed = x.iloc[x.index.get_indexer([first_date], method='pad')[0]]

    # NOTE(review): values are carried forward only between requested dates;
    # points of ``x`` lying strictly between two requested dates are not
    # consulted after the first date - confirm this is the intended behaviour.
    # Vectorised equivalent of walking the curve and carrying the last seen
    # value forward (Series.iteritems was removed in pandas 2.0).
    curve = curve.ffill()
    if pd.notna(seed):
        curve = curve.fillna(seed)
    return curve
def test_align_series_combinations(self): df = DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) s = Series([1, 2, 4], index=list("ABD"), name="x") # frame + series res1, res2 = df.align(s, axis=0) exp1 = DataFrame( { "a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5] }, index=list("ABCDE"), ) exp2 = Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") tm.assert_frame_equal(res1, exp1) tm.assert_series_equal(res2, exp2) # series + frame res1, res2 = s.align(df) tm.assert_series_equal(res1, exp2) tm.assert_frame_equal(res2, exp1)
def pairs_selection_test(price1: pd.Series, price2: pd.Series, max_lag: int,
                         convenient_periods: int) -> bool:
    """
    Decide whether two price series form an acceptable trading pair.

    The pair must pass four successive filters: cointegration, a
    mean-reverting Hurst exponent, a suitable half-life and a minimum number
    of mean crossings per year.  The input series are not modified.

    Parameters
    ----------
    price1 : pd.Series
        Price series of 1st security. The spread is resampled by month and
        year, so a datetime-like index is required.
    price2 : pd.Series
        Price series of 2nd security.
    max_lag: int
        Exclusive upper bound of the lags (starting at 2) used to estimate
        the Hurst exponent of the spread.
    convenient_periods: int
        Specify the trading period. This will be used to filter out pairs
        whose half-life is not coherent with the trading period.

    Returns
    -------
    bool
        whether the pair passes the selection test.
    """
    # Clean copies rather than using inplace=True, which silently altered the
    # caller's series.
    price1 = price1.replace([np.inf, -np.inf], np.nan).dropna()
    price2 = price2.replace([np.inf, -np.inf], np.nan).dropna()
    price1, price2 = price1.align(price2, join='inner')

    # Cointegrated pairs; Propose that the Engle-Granger test is run for the
    # two possible selections of the dependent variable and that the combination
    # that generated the lowest t-statistic is selected.
    _, pval1, _ = coint(price1, price2)
    _, pval2, _ = coint(price2, price1)
    if min(pval1, pval2) >= .01:
        return False

    spread = np.log(price1 / price2)

    # Mean-reverting Hurst exponent; It aims to constrain false positives,
    # possibly arising as an effect of the multiple comparisons problem. The
    # condition imposed is that the Hurst exponent associated with the spread
    # of a given pair is enforced to be smaller than 0.5, assuring the process
    # leans towards mean-reversion.
    lags = range(2, max_lag)
    # Lagged differences must be positional. Subtracting two Series slices
    # aligns them by label first, which turns every overlapping difference
    # into zero, so work on the underlying array.
    spread_values = spread.to_numpy()
    tau = [
        np.sqrt(np.std(spread_values[lag:] - spread_values[:-lag]))
        for lag in lags
    ]
    poly = np.polyfit(np.log(lags), np.log(tau), 1)
    hurst_exponent = poly[0] * 2
    if hurst_exponent >= 0.5:
        return False

    # Suitable half-life; the value of theta which is obtained by running a
    # linear regression on the difference between mean of spread and spread,
    # and the difference between tomorrow's value of spread and today's value
    # NOTE(review): the deviation (mean - spread) is the dependent variable and
    # the one-step change is the regressor; half-life estimates commonly regress
    # the change on the deviation - confirm this orientation is intended.
    model = sm.OLS((np.mean(spread) - spread).iloc[:-1],
                   (spread.shift(-1) - spread).iloc[:-1])
    results = model.fit()
    # Take the single coefficient by position; label-based ``params[0]`` relies
    # on a deprecated positional fallback.
    theta = np.asarray(results.params)[0]
    half_life = -np.log(2) / theta
    if half_life > convenient_periods:
        return False

    # Monthly Mean Crossing; Enforce that every spread crosses its mean at least
    # once per month, to provide enough liquidity.
    spread_mu = spread.resample('MS').transform('mean')
    delta_sign = (np.sign(spread - spread_mu).diff().dropna() != 0).astype(int)
    num_of_cross_per_year = delta_sign.resample('Y').sum()
    # Every year in the sample needs at least 12 crossings.
    return bool((num_of_cross_per_year >= 12).all())
def test_align_broadcast_axis(self): # GH 13194 # First four tests for DataFrame.align(Index) # For 'right' join df = DataFrame(np.array([[1., 2.], [3., 4.]]), columns=list('AB')) ts = Series([5., 6., 7.]) result = df.align(ts, join='right', axis=0, broadcast_axis=1) expected1 = DataFrame(np.array([[1., 2.], [3., 4.], [pd.np.nan, pd.np.nan]]), columns=list('AB')) expected2 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]), columns=list('AB')) assert_frame_equal(result[0], expected1) assert_frame_equal(result[1], expected2) # For 'right' join on different index result = df.align(ts, join='right', axis=1, broadcast_axis=1) expected1 = DataFrame(np.array([[1., 2.], [3., 4.]]), columns=list('AB')) expected2 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]), columns=list('AB')) assert_frame_equal(result[0], expected1) assert_frame_equal(result[1], expected2) # For 'left' join result = df.align(ts, join='left', axis=0, broadcast_axis=1) expected1 = DataFrame(np.array([[1., 2.], [3., 4.]]), columns=list('AB')) expected2 = DataFrame(np.array([[5., 5.], [6., 6.]]), columns=list('AB')) assert_frame_equal(result[0], expected1) assert_frame_equal(result[1], expected2) # For 'left' join on different axis result = df.align(ts, join='left', axis=1, broadcast_axis=1) expected1 = DataFrame(np.array([[1., 2.], [3., 4.]]), columns=list('AB')) expected2 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]), columns=list('AB')) assert_frame_equal(result[0], expected1) assert_frame_equal(result[1], expected2) # Series.align(DataFrame) tests, 'outer' join result = ts.align(df, join='outer', axis=0, broadcast_axis=1) expected1 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]), columns=list('AB')) expected2 = DataFrame(np.array([[1., 2.], [3., 4.], [pd.np.nan, pd.np.nan]]), columns=list('AB')) assert_frame_equal(result[0], expected1) assert_frame_equal(result[1], expected2) # Series.align(DataFrame) tests, 'inner' join result = ts.align(df, join='inner', axis=0, broadcast_axis=1) 
expected1 = DataFrame(np.array([[5., 5.], [6., 6.]]), columns=list('AB')) expected2 = DataFrame(np.array([[1., 2.], [3., 4.]]), columns=list('AB')) assert_frame_equal(result[0], expected1) assert_frame_equal(result[1], expected2)
def interpolate(x: pd.Series, dates: Union[List[date], List[time], pd.Series] = None,
                method: Interpolate = Interpolate.INTERSECT) -> pd.Series:
    """
    Interpolate over specified dates or times

    :param x: timeseries to interpolate
    :param dates: array of dates/times or another series to interpolate
    :param method: interpolation method (default: intersect)
    :return: timeseries with specified dates
    :raises MqValueError: if ``method`` is not one of the handled methods

    **Usage**

    Interpolate the series X over the dates specified by the dates parameter. This can be an array of dates or
    another series, in which case the index of the series will be used to specify dates. When no dates are given,
    the index of X itself is used.

    Interpolation methods:

    ========= ========================================================================
    Type      Behavior
    ========= ========================================================================
    intersect Resultant series only has values on the intersection of dates / times.
              Will only contain intersection of valid dates / times in the series
    nan       Resultant series has values on all requested dates / times. Value will
              be NaN for dates not present in the series
    zero      Resultant series has values on all requested dates / times. The series
              will have a value of zero where the requested date or time was not
              present in the series
    step      Resultant series has values on all requested dates / times. The series
              will use the value of the previous valid point if requested date does
              not exist. Values prior to the first date will be equivalent to the
              first available value
    ========= ========================================================================

    **Examples**

    Interpolate series based on the dates in a second series:

    >>> a = generate_series(100)
    >>> b = generate_series(100)
    >>> interpolate(a, b, Interpolate.INTERSECT)

    **See also**

    :func:`sub`
    """
    if dates is None:
        dates = x

    # Only the index of align_series matters: it carries the requested dates.
    if isinstance(dates, pd.Series):
        align_series = dates
    else:
        align_series = pd.Series(np.nan, index=dates)

    if method == Interpolate.INTERSECT:
        return x.align(align_series, join='inner')[0]
    if method == Interpolate.NAN:
        return x.align(align_series, join='right')[0]
    if method == Interpolate.ZERO:
        # Build the zero series from the *index* of the requested dates; passing a
        # Series as the index would use its values as the index labels instead.
        zero_series = pd.Series(0.0, index=align_series.index)
        return x.align(zero_series, join='right', fill_value=0)[0]
    if method == Interpolate.STEP:
        return __interpolate_step(x, align_series)

    # str() is required: concatenating an enum member (or any non-str) to a str
    # raises TypeError and would hide the intended MqValueError.
    raise MqValueError('Unknown intersection type: ' + str(method))
def test_align_series(self, join_type): rng = period_range('1/1/2000', '1/1/2010', freq='A') ts = Series(np.random.randn(len(rng)), index=rng) ts.align(ts[::2], join=join_type)
# In[11]: s1 = s[:4] s1 # In[10]: s2 = s[1:] s2 # In[12]: s1.align(s2) # In[13]: df.align(df2, join ='inner') # In[ ]: # In[ ]: #filter and column selection in single statement
def test_align_periodindex(join_type): rng = period_range("1/1/2000", "1/1/2010", freq="A") ts = Series(np.random.randn(len(rng)), index=rng) # TODO: assert something? ts.align(ts[::2], join=join_type)