def pandas_xy_dist(df, x_col='x', y_col='y'):
    """
    Takes in a pandas dataframe containing x and y coordinates.
    Calculates the euclidean distance between point pairs as well as the
    cumulative distance.

    VARIABLES
    df : a pandas dataframe
    x_col : x column header (default 'x')
    y_col : y column header (default 'y')

    RETURN
    Neighbour distance array
    Cumulative distance array
    Input dataframe with a 'cumulative' column added
    """
    df = df.assign(x_diff=pd.rolling_apply(df[x_col], 2, lambda x: x[1] - x[0]))
    df = df.assign(y_diff=pd.rolling_apply(df[y_col], 2, lambda y: y[1] - y[0]))
    df['xy_distance'] = np.hypot(df['y_diff'], df['x_diff'])
    spacing = df['xy_distance'].values
    cumulative = np.asarray(np.cumsum(df['xy_distance']))
    cumulative[0] = 0
    df = df.drop('x_diff', 1)
    df = df.drop('y_diff', 1)
    df['cumulative'] = cumulative
    return spacing, cumulative, df
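# Hedged sketch (not part of the original code): pd.rolling_apply was removed in
# newer pandas releases. On current pandas the same pairwise spacing and
# cumulative distance can be computed with Series.diff(); function name is
# illustrative only.
import numpy as np
import pandas as pd

def xy_dist_modern(df, x_col='x', y_col='y'):
    # difference between consecutive points replaces the window-of-2 rolling_apply
    dx = df[x_col].diff()
    dy = df[y_col].diff()
    spacing = np.hypot(dx, dy).values
    cumulative = np.nancumsum(spacing)  # first NaN is treated as 0
    return spacing, cumulative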
def GenSingleAlphaWeight(self, dfData):
    n = len(dfData)
    dfAlpha = dfData[0][[self.alpha_name]].copy()
    for i in range(1, n):
        dfAlpha = pd.merge(dfAlpha, dfData[i][[self.alpha_name]],
                           left_index=True, right_index=True, how='outer')
    asset_names = []
    for i in range(n):
        asset_names.append(dfData[i].index.name)
    dfAlpha.columns = asset_names
    dfAlpha.index.name = 'Date'
    dfAlpha['lower'] = pd.rolling_apply(np.arange(len(dfAlpha)), self.window_length,
                                        self.threshold_calc,
                                        args=(dfAlpha, self.lower_percentile),
                                        min_periods=1)
    dfAlpha['upper'] = pd.rolling_apply(np.arange(len(dfAlpha)), self.window_length,
                                        self.threshold_calc,
                                        args=(dfAlpha, self.upper_percentile),
                                        min_periods=1)
    dfAlpha['lower'] = dfAlpha['lower'].shift(1)
    dfAlpha['upper'] = dfAlpha['upper'].shift(1)
    dfAlpha.dropna(inplace=True)
    dfAlpha = dfAlpha[str(self.start_date):]
    dfAlphaWeight = dfAlpha.apply(self.func, axis=1)
    del dfAlpha['upper']
    del dfAlpha['lower']
    del dfAlphaWeight['upper']
    del dfAlphaWeight['lower']
    return (dfAlpha, dfAlphaWeight)
def preProcess(zig):
    f1 = pd.rolling_apply(zig, 5, lambda x: 100 * (x[2] - x[0]) / x[0], center=True, min_periods=5)
    f2 = pd.rolling_apply(zig, 5, lambda x: 100 * (x[4] - x[2]) / x[2], center=True, min_periods=5)
    f3 = pd.rolling_apply(zig, 5, lambda x: 100 * (x[4] - x[0]) / x[0], center=True, min_periods=5)
    f4 = pd.rolling_apply(zig, 5, lambda x: 100 * (x[3] - x[1]) / x[1], center=True, min_periods=5)
    f5 = pd.rolling_apply(zig, 5, lambda x: 100 * (x[1] - x[0]) / x[0], center=True, min_periods=5)
    features = pd.DataFrame({'f1': f1, 'f2': f2, 'f3': f3, 'f4': f4, 'f5': f5})
    return features.dropna()
def BBANDS(data, n=20, m=2):
    data['bbands_mid'] = ta.SMA(np.array(data[['high', 'low', 'close']].mean(axis=1)), n)
    data['bbands_up'] = data['bbands_mid'] + m * pd.rolling_apply(data.close, n, np.std)
    data['bbands_dn'] = data['bbands_mid'] - m * pd.rolling_apply(data.close, n, np.std)
    signal = pd.DataFrame(index=data.index)
    """
    When the close crosses above the upper band, buy: signal = 1.
    When the close crosses below the lower band, sell short: signal = -1.
    The default lookback is 20.
    """
    signal['1'] = (((data['close'] > data['bbands_up']) &
                    (data['close'].shift(1) < data['bbands_up'].shift(1))) * 1 +
                   ((data['close'] < data['bbands_dn']) &
                    (data['close'].shift(1) > data['bbands_dn'].shift(1))) * (-1))
    signal['1'] = signal['1'][signal['1'].isin([1, -1])].reindex(data.index, method='ffill')
    signal = signal.fillna(0)
    return signal
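# Hedged sketch (assumption, not from the original): on newer pandas the band
# width can be computed with the method-based rolling API. Note that
# rolling_apply with np.std gives the population std (ddof=0), while
# Series.rolling().std() defaults to ddof=1, so ddof is passed explicitly to
# match the values above.
def rolling_std_modern(close, n=20):
    # replacement for pd.rolling_apply(close, n, np.std)
    return close.rolling(n).std(ddof=0)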
def cci(df_typ, df_c, i_period):
    """
    http://en.wikipedia.org/wiki/Commodity_channel_index
    CCI = (p - SMA(p)) / (σ(p) * 0.015)
    p = typical price
    SMA = simple moving average
    σ = mean absolute deviation
    """
    i_len = len(df_typ)
    assert i_len >= i_period
    # mean absolute deviation over the same lookback as the SMA
    df_mad = pd.rolling_apply(df_c, i_period, lambda x: np.fabs(x - x.mean()).mean())
    df_sma = sma(df_c, i_period)
    df_cci = (df_typ - df_sma) / (df_mad * 0.015)
    # set values before i_period to zero (wait for enough data)
    df_cci[:i_period - 1] = 0.
    df_cci.name = 'cci' + str(i_period)
    return df_cci
def min_rolling_theta_entropy(pos, window=24, bins=24):
    """Compute the minimum Shannon entropy in any window.

    Parameters
    ----------
    pos : DataFrame with columns x and y, indexed by frame
    window : number of observations per window
    bins : number of equally-spaced bins in distribution. Default 24.

    Returns
    -------
    float : Shannon entropy

    Examples
    --------
    >>> theta_entropy(t[t['particle'] == 3].set_index('frame'))

    >>> S = t.set_index('frame').groupby('particle').apply(
    ...     tp.min_rolling_theta_entropy)
    """
    disp = pos - pos.shift(1)
    direction = np.arctan2(disp['y'], disp['x'])
    bins = np.linspace(-np.pi, np.pi, bins + 1)
    f = lambda x: shannon_entropy(x, bins)
    return pd.rolling_apply(direction.dropna(), window, f).min()
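# Hedged sketch (assumption, not from the original): the same minimum rolling
# entropy written against the method-based rolling API of current pandas.
# shannon_entropy is assumed to be the same helper the original function uses.
import numpy as np
import pandas as pd

def min_rolling_theta_entropy_modern(pos, window=24, bins=24):
    disp = pos - pos.shift(1)
    direction = np.arctan2(disp['y'], disp['x'])
    edges = np.linspace(-np.pi, np.pi, bins + 1)
    # Rolling.apply replaces pd.rolling_apply; raw=True passes plain ndarrays
    return direction.dropna().rolling(window).apply(
        lambda x: shannon_entropy(x, edges), raw=True).min()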
def denormalize(df, col):
    vals = df[col].copy().dropna().sort_values().round(8)
    vals = pd.rolling_apply(vals, 2, lambda x: x[1] - x[0])
    vals = vals[vals > 1e-5]
    denom = vals.value_counts().idxmax()
    denormalized = np.round(np.array(df[col] / denom), 0).astype(int)
    return denormalized
def run():
    num_average_ticks = 12
    # v = ['B', 'H', 'S'], p = [0.05, 0.9, 0.05]
    d = pd.DataFrame(DATA[['timestamp', 'last']])
    d['returns'] = compute_returns(d['last'])
    print(d['returns'].head())
    print(d['returns'].rolling(window=2, center=False).mean().head())
    print(d['returns'])
    sr_column = 'sharpe_ratio_{}'.format(num_average_ticks)
    # Reversing the series makes this a forward-looking apply rather than the
    # usual backward-looking one.
    d[sr_column] = pd.rolling_apply(d['returns'][::-1], window=num_average_ticks,
                                    func=sharpe_ratio, center=False).fillna(0)[::-1]
    print(d.tail(100))
    labels = ['SELL', 'HOLD', 'BUY']
    d['signals'] = pd.qcut(d[sr_column], q=[0, 0.05, 0.95, 1], labels=[0, 1, 2])
    print(d.head(100))
    print(d['signals'].head(100))
    d['signals'].astype(np.float).plot()
    import matplotlib.pyplot as plt
    plt.show()
def find_denominator(df, col):
    print(type(df[col].dropna()))
    vals = df[col].dropna().sort_values().round(8)
    vals = pd.rolling_apply(vals, 2, lambda x: x[1] - x[0])
    vals = vals[vals > 0.000001]
    return vals.value_counts().idxmax()
def src_step1():
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import tushare as ts
    import sys
    sys.path.append("..")
    from JSONData import tdx_data_Day as tdd

    # http://stackoverflow.com/questions/21058333/compute-rolling-maximum-drawdown-of-pandas-series
    def max_dd(ser):
        max2here = pd.expanding_max(ser)
        dd2here = ser - max2here
        return dd2here.min()

    np.random.seed(0)
    n = 100
    s = pd.Series(np.random.randn(n).cumsum())
    # s.plot()
    # plt.show()
    code = '999999'
    # d = ts.get_hist_data(code).sort_index()
    d = tdd.get_tdx_Exp_day_to_df(code, 'f').sort_index()
    rolling_dd = pd.rolling_apply(d.close, 200, max_dd, min_periods=0)
    df = pd.concat([d.close, rolling_dd], axis=1)
    df.columns = [code, 'rol_dd_10']
    df.plot()
    plt.show()
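# Hedged sketch (assumption, not from the original): the same rolling maximum
# drawdown on current pandas, where pd.expanding_max and pd.rolling_apply are
# replaced by their method-based equivalents.
import numpy as np
import pandas as pd

def rolling_max_drawdown(close, window=200):
    def max_dd(window_vals):
        # drawdown relative to the running maximum within the window
        running_max = np.maximum.accumulate(window_vals)
        return (window_vals - running_max).min()
    return close.rolling(window, min_periods=1).apply(max_dd, raw=True)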
def rollingReturn(data, horizon):
    '''
    Function to calculate rolling returns over a horizon.

    rollingReturn computes the returns over a horizon.
    Example: average 1-Year return
    >> rollingReturn(data, 12)

    Input:
    - data (timeseries): timeseries of monthly return data
    - horizon (int): window size for rolling analysis

    Returns:
    - rollingReturn (timeseries): timeseries of the same size as data
    '''
    cleanData = utils.processData(data)

    if (1 <= horizon <= len(cleanData)) & isinstance(horizon, int):
        # Calculate rolling returns
        rollingReturns = pd.rolling_apply(cleanData, horizon,
                                          lambda x: np.prod(1 + x) - 1)
        return rollingReturns
    else:
        raise customExceptions.invalidInput('rollingReturn')
def aspect_list(planet1, planet2, start, end, aspect, freq='3H', scale=1):
    """
    Return a list of aspects made by 2 planets in the given time span and aspect.
    Only the exact day is needed, so the calculation can be simplified;
    modify freq to get a different accuracy.
    """
    # we don't normalize the distance when calculating the 0/180 degree aspects.
    if aspect in [0, 180]:
        diffs = location_diff(planet1, planet2, start, end, freq,
                              scale=scale, fit180=False)
        # only search for the 180 degree aspect when the distance is bigger than 90 degrees.
        if aspect == 180:
            diffs_new = diffs[abs(diffs) > 90].apply(
                lambda x: x - _sign(x) * 180)
        else:
            diffs_new = diffs
    # for other aspects
    else:
        diffs = location_diff(planet1, planet2, start, end, freq, scale=scale)
        # now we can treat all aspects like a conjunction
        diffs_new = diffs - aspect
    aspectlist = pd.rolling_apply(diffs_new, 2, aspected)
    tindex = aspectlist[aspectlist == True].index
    res = pd.Series([aspect] * len(tindex), tindex)
    return res
def n_period_return(price_series, n):
    '''
    Using the given price series, returns a new series corresponding to the
    cumulative net returns of n periods ago for each period in the series.
    '''
    gross_ret = 1 + price_series.pct_change()
    gross_np_ret = pd.rolling_apply(gross_ret, n, lambda x: x.prod())
    return gross_np_ret - 1
def plot_cumulative_returns_by_quantile(quantile_returns, bin_list=None, period=1, ax=None):
    if ax is None:
        f, ax = plt.subplots(1, 1, figsize=(18, 6))

    return_wide = quantile_returns.reset_index().pivot(index='date',
                                                       columns='factor_quantile',
                                                       values=period)
    if period > 1:
        def compound_returns(ret, n):
            return (np.nanmean(ret) + 1)**(1. / n) - 1
        return_wide = pd.rolling_apply(return_wide, period, compound_returns,
                                       min_periods=1, args=(period,))

    cum_ret = return_wide.add(1).cumprod()
    if bin_list is not None:
        cum_ret = cum_ret[bin_list]
    cum_ret.plot(lw=2, ax=ax)
    ax.legend()
    y_min, y_max = cum_ret.min().min(), cum_ret.max().max()
    ax.set(ylabel='Log Cumulative Returns',
           title='Cumulative Return by Quantile ({} Period Forward Return)'.format(period),
           xlabel='',
           yscale='symlog',
           yticks=np.linspace(y_min, y_max, 5),
           ylim=(y_min, y_max))
    ax.yaxis.set_major_formatter(ScalarFormatter())
    ax.axhline(1.0, linestyle='-', color='black', lw=1)
    return ax
def calculate_indicator(self, label):
    # self.stock.close_prices()
    prices = np.array(self.stock.get_data(label), dtype=np.float64)
    last = lambda x: x[-1]
    prices_last = pd.rolling_apply(prices, self.window, last)
    moving_average = pd.rolling_mean(prices, self.window)
    result = (prices_last - moving_average) / moving_average * 100  # include nan
    return result.tolist()
def rolling_sparse_average(self, data_frame, periods):
    """
    rolling_sparse_average - Calculates the rolling moving average of a sparse time series

    Parameters
    ----------
    data_frame : DataFrame
        contains time series
    periods : int
        number of periods in the rolling sparse average

    Returns
    -------
    DataFrame
    """
    # 1. calculate rolling sum (ignore NaNs)
    # 2. count number of non-NaNs
    # 3. average of non-NaNs
    foo = lambda z: z[pandas.notnull(z)].sum()
    rolling_sum = pandas.rolling_apply(data_frame, periods, foo, min_periods=1)
    rolling_non_nans = pandas.stats.moments.rolling_count(data_frame, periods,
                                                          freq=None, center=False, how=None)
    return rolling_sum / rolling_non_nans
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    assert_eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    assert_eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    assert_eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    assert_eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    assert_eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    assert_eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    assert_eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    assert_eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    # see note around test_rolling_dataframe for logic concerning precision
    assert_eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3),
              check_less_precise=True)
    assert_eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3),
              check_less_precise=True)
    assert_eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    assert_eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    with ignoring(ImportError):
        assert_eq(pd.rolling_window(p, 3, 'boxcar'), dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    assert_eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    assert_eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    assert_eq(pd.rolling_sum(p, 3, min_periods=3), dd.rolling_sum(d, 3, min_periods=3))
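# Hedged sketch (assumption, not from the original tests): the method-based
# rolling API that replaced the pd.rolling_* functions is mirrored by dask
# dataframes, so equivalent comparisons on current versions could look like this.
import numpy as np
import pandas as pd
import dask.dataframe as dd

p_series = pd.Series(np.random.randn(30))
d_series = dd.from_pandas(p_series, npartitions=3)
# rolling sum and mean computed eagerly in pandas and lazily in dask
pd.testing.assert_series_equal(p_series.rolling(3).sum(),
                               d_series.rolling(3).sum().compute())
pd.testing.assert_series_equal(p_series.rolling(3).mean(),
                               d_series.rolling(3).mean().compute())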
def next_aspect_states(planet1, planet2, start=None, num=1, asps=None, freq='3H', scale=1):
    """return aspect state changes(station/direction) of given planets
    within given time span.
    """
    if not start:
        start = datetime.today()
    elif type(start) is str:
        start = pd.to_datetime(start)
    if not asps:
        asps = [0, 60, 90, 120, 180]
    res = dict()
    # get last asp
    pre_asp = previous_aspect(planet1, planet2, start=start, asps=asps,
                              freq=freq, scale=scale, num=1)
    while len(res) < num:
        next_asps = next_aspect(planet1, planet2, start=start, asps=asps,
                                num=10, freq=freq, scale=scale)
        asps_list = pd.concat([pre_asp, next_asps])
        stay_list = pd.rolling_apply(asps_list, 2,
                                     lambda xs: True if xs[0] == xs[1] else np.nan)
        stay_list = stay_list.dropna()
        for i in range(len(stay_list)):
            i0 = asps_list.index.get_loc(stay_list.index[i]) - 1
            i1 = i0 + 1
            res[asps_list.index[i0]] = -1
            res[asps_list.index[i1]] = 1
        pre_asp = asps_list[-1:]
        start = asps_list.index[-1]
    return pd.Series(res)
def Rate(self, time_window):
    """Apply rate function to all time series in this query."""
    if self.time_series is None:
        raise RuntimeError("Rate must be called after Take*().")
    if self.sample_interval is None:
        raise RuntimeError("Resample() must be called prior to Rate().")
    if time_window.seconds % self.sample_interval.seconds:
        raise RuntimeError("Rate's time window should be divisible by sampling "
                           "time window (rate time window: %s, sampling time "
                           "window: %s)." % (time_window, self.sample_interval))

    num_samples = time_window.seconds / self.sample_interval.seconds + 1
    num_seconds = float(time_window.seconds)

    def Rate(x):
        return (x[-1] - x[0]) / num_seconds

    new_time_series = []
    for time_serie in self.time_series:
        new_time_series.append(pandas.rolling_apply(
            time_serie, num_samples, Rate)[(num_samples - 1):])

    self.time_series = new_time_series
    return self
def plot_cumulative_returns_by_quantile(quantile_returns, period=1, ax=None):
    """
    Plots the cumulative returns of various factor quantiles.

    When 'period' N is greater than 1, the cumulative returns plot is computed
    by building and averaging the cumulative returns of N interleaved portfolios
    (started at subsequent periods 1, 2, 3, ..., N), each one rebalancing every
    N periods. This results in trading the factor at every value/signal computed
    by the factor, and the cumulative returns do not depend on a specific
    starting date.

    Parameters
    ----------
    quantile_returns : pd.DataFrame
        Cumulative returns by factor quantile.
    period : int, optional
        Period over which the daily returns are calculated
    ax : matplotlib.Axes, optional
        Axes upon which to plot.

    Returns
    -------
    ax : matplotlib.Axes
    """
    if ax is None:
        f, ax = plt.subplots(1, 1, figsize=(18, 6))

    ret_wide = quantile_returns.reset_index()\
        .pivot(index='date', columns='factor_quantile', values=period)

    if period > 1:
        # build N portfolios each rebalancing every N periods and average them
        ret_wide = pd.rolling_apply(
            ret_wide, period,
            # rate of 1 period returns
            lambda ret, period: ((np.nanmean(ret) + 1)**(1. / period)) - 1,
            min_periods=1, args=(period,))

    cum_ret = ret_wide.add(1).cumprod()
    cum_ret = cum_ret.loc[:, ::-1]

    cum_ret.plot(lw=2, ax=ax, cmap=cm.RdYlGn_r)
    ax.legend()
    ymin, ymax = cum_ret.min().min(), cum_ret.max().max()
    ax.set(ylabel='Log Cumulative Returns',
           title='''Cumulative Return by Quantile ({} Period Forward Return)'''.format(period),
           xlabel='',
           yscale='symlog',
           yticks=np.linspace(ymin, ymax, 5),
           ylim=(ymin, ymax))
    ax.yaxis.set_major_formatter(ScalarFormatter())
    ax.axhline(1.0, linestyle='-', color='black', lw=1)
    return ax
def find_denominator(df, col):
    # Finds the approximate denominator used for scaling
    # (used to undo feature scaling)
    # Credit: http://bit.ly/1RV4w0y
    vals = df[col].dropna().sort_values().round(8)
    vals = pd.rolling_apply(vals, 2, lambda x: x[1] - x[0])
    vals = vals[vals > 0.000001]
    return vals.value_counts().idxmax()
def expected_ewmstd(self, window_length, decay_rate):
    alpha = 1 - decay_rate
    span = (2 / alpha) - 1
    return rolling_apply(
        self.raw_data,
        window_length,
        lambda window: ewmstd(window, span=span)[-1],
    )[window_length:]
def blended_rolling_apply(series, window=2, fun=pd.np.mean):
    new_series = pd.Series(
        np.fromiter((fun(series[:i + 1]) for i in range(window - 1)),
                    type(series.values[0])),
        index=series.index[:window - 1]).append(
            pd.rolling_apply(series.copy(), window, fun)[window - 1:])
    assert len(series) == len(new_series), (
        "blended_rolling_apply should always return a series of the same length! "
        "len(series) = {0} != {1} = len(new_series)".format(len(series), len(new_series)))
    assert not any(np.isnan(val) or val is None for val in new_series)
    return new_series
def computeInterpStep(df):
    steps = []
    grouped = df.groupby(['race_id', 'User', 'laccid'])
    for ix, group in grouped:
        step = np.mean(pd.rolling_apply(group['ratio'], 2, np.diff))
        steps.append(step)
    interpStep = np.mean(steps)
    return interpStep
def rolling_apply(df, lookback, fn):
    index_series = pd.Series(range(len(df)))
    result = pd.rolling_apply(index_series, lookback,
                              lambda ii: _window_apply(df, ii, fn))
    result.index = df.index
    return result
def find_denominator(df, col):
    """
    Try to find an approximate denominator used for scaling,
    so we can undo the feature scaling.
    """
    vals = df[col].dropna().sort_values().round(8)
    vals = pd.rolling_apply(vals, 2, lambda x: x[1] - x[0])
    vals = vals[vals > 0.000001]
    return vals.value_counts().idxmax()
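# Hedged sketch (assumption, not from the original): on current pandas the
# pairwise difference of consecutive sorted values can be computed with
# Series.diff(), replacing the window-of-2 rolling_apply used by the
# find_denominator variants above.
import pandas as pd

def find_denominator_modern(df, col):
    vals = df[col].dropna().sort_values().round(8)
    gaps = vals.diff()                      # x[1] - x[0] for each consecutive pair
    gaps = gaps[gaps > 0.000001]
    return gaps.value_counts().idxmax()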
def calculate_indicator(self):
    yesterday_value = lambda x: x[0]
    today_value = lambda x: x[1]
    previous_close = pd.rolling_apply(
        np.array(self.stock.close_prices(), dtype=np.float64), 2, yesterday_value)
    current_high = pd.rolling_apply(
        np.array(self.stock.high_prices(), dtype=np.float64), 2, today_value)
    current_low = pd.rolling_apply(
        np.array(self.stock.low_prices(), dtype=np.float64), 2, today_value)
    true_high = map((lambda x, y: max(x, y)), previous_close, current_high)
    true_low = map((lambda x, y: min(x, y)), previous_close, current_low)
    true_ranges = (np.array(true_high, dtype=np.float64)
                   - np.array(true_low, dtype=np.float64))
    return pd.rolling_mean(true_ranges, self.window).tolist()
def compute_sign_change_cnt(series):
    """
    Compute the number of sign changes in the time series data.

    :return: feature value
    """
    if series is None or len(series) == 0:
        return np.nan
    sc_series = series[pd.rolling_apply(
        series, 2, lambda x: (x[0] > 0 > x[1]) or (x[0] < 0 < x[1])) > 0]
    return len(sc_series)
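# Hedged sketch (assumption, not from the original): the same count can be
# obtained without a rolling window by comparing the signs of consecutive
# values; series is assumed to be a pandas Series as in the function above.
import numpy as np
import pandas as pd

def sign_change_count_modern(series):
    if series is None or len(series) == 0:
        return np.nan
    s = np.sign(series)
    # a sign change is a transition from strictly positive to strictly negative or vice versa
    return int(((s.shift(1) * s) < 0).sum())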
def calculate_indicator(self, label):
    prices = np.array(self.stock.get_data(label), dtype=np.float64)
    last = lambda x: x[-1]
    prices_last = pd.rolling_apply(prices, self.window, last)
    first = lambda x: x[0]
    prices_first = pd.rolling_apply(prices, self.window, first)
    direction = prices_last - prices_first
    direction[direction > 0] = 100
    direction[direction < 0] = -100
    res = direction.astype(str)
    res[res == '100.0'] = 'up'
    res[res == '-100.0'] = 'down'
    res[res == '0.0'] = 'flat'
    return res.tolist()
def plot_score(ax, series, labels, colors, ylabel):
    """Score plot where the score is calculated as the 90th percentile.
    Quite useful for trend and dip analysis."""
    ax.set_ylabel("Percentile of score ({})".format(ylabel))
    ax.set_xlabel("Time elapsed, sec")
    for s, label, color in zip(series, labels, colors):
        scoref = lambda x: stats.percentileofscore(x, s.quantile(0.9))
        rolling_score = pd.rolling_apply(s, min(len(s) / 15, 40), scoref)
        ax.plot(s.index, rolling_score, label=label, color=color)
    plt.ylim(ymin=0, ymax=105)
print(close_px)
spx_px = close_px_all['SPX']
spx_rets = spx_px / spx_px.shift(1) - 1
returns = close_px.pct_change()
corr = pd.rolling_corr(returns.AAPL, spx_rets, 125, min_periods=100)
corr.plot()
plt.show()

corr = pd.rolling_corr(returns, spx_rets, 125, min_periods=100)
corr.plot()
plt.show()

from scipy.stats import percentileofscore
score_at_2percent = lambda x: percentileofscore(x, 0.02)
result = pd.rolling_apply(returns.AAPL, 250, score_at_2percent)
result.plot()
plt.show()
'''
print('-------------------------')

# Time series case study
# Parameter initialization
discfile = 'arima_data.xls'
forecastnum = 5

# Read the data with the date column as the index; pandas automatically parses
# the '日期' (date) column as Datetime
data = pd.read_excel(discfile, index_col=u'日期')
data = pd.DataFrame(data, dtype=np.float64)
print('data : ', data)

# Time series plot
def getSpeeds(self, dfTrack):
    dfTrack['SpeedX'] = pd.rolling_apply(dfTrack["AvgPosX"], 2, self.getLastDiff)
    dfTrack['SpeedY'] = pd.rolling_apply(dfTrack["AvgPosY"], 2, self.getLastDiff)
    dfTrack['Speed'] = np.sqrt(dfTrack['SpeedX']**2 + dfTrack['SpeedY']**2)
    return dfTrack
def rolling_profit_count(dataframe):
    # Returns the running total where each profitable trade counts +1 and each
    # losing trade counts -1.
    count = 0
    for profit in dataframe:
        if profit >= 0:
            count = count + 1
        else:
            count = count - 1
    return count

# Add rolling stats to the order DataFrame
window = 15
df_ord['Roll_Profit_Count'] = pd.rolling_apply(df_ord['Profit'], window, rolling_profit_count, 1)
df_ord['Roll_Mean'] = pd.rolling_mean(df_ord['Profit'], window)
df_ord['Roll_std'] = pd.rolling_std(df_ord['Profit'], window)
df_ord['Roll_var'] = pd.rolling_var(df_ord['Profit'], window)

# Create trend ranges, based on visual inspection of the previous graph, and add trends to the order DataFrame
trend_range = [0, 6, 33, 60, 77, 86, 150, 171, 207, 222, 271, 314, 331, 348]
trend_labels = ['UP1', 'FLAT1', 'DOWN1', 'UP2', 'DOWN2', 'FLAT2', 'DOWN3',
                'UP3', 'DOWN4', 'UP4', 'DOWN5', 'UP6', 'FLAT3']
df_ord['Trend'] = pd.cut(df_ord.Ticket, trend_range, labels=trend_labels).astype('category')

# Order DataFrame with trends added
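# Hedged sketch (assumption, not from the original): the same rolling +1/-1
# profit count expressed with the method-based rolling API. Trades with
# Profit >= 0 count +1 and losses count -1, matching rolling_profit_count above;
# profit is assumed to be a pandas Series.
import numpy as np
import pandas as pd

def rolling_profit_count_modern(profit, window=15):
    signs = np.where(profit >= 0, 1, -1)
    return pd.Series(signs, index=profit.index).rolling(window).sum()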
def decay_linear(self, x, n):
    return pd.rolling_apply(x, n, self.decay_linear_array)
def getAccelerations(self, dfTrack):
    dfTrack['AccX'] = pd.rolling_apply(dfTrack["SpeedX"], 2, self.getLastDiff)
    dfTrack['AccY'] = pd.rolling_apply(dfTrack["SpeedY"], 2, self.getLastDiff)
    dfTrack['Acc'] = np.sqrt(dfTrack['AccX']**2 + dfTrack['AccY']**2)
    return dfTrack
def AVEDEV(self, param):
    return pd.rolling_apply(param[0], param[1], lambda x: pd.DataFrame(x).mad())
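# Hedged sketch (assumption, not from the original): DataFrame.mad() was removed
# from recent pandas, so the rolling mean absolute deviation can be written
# directly against numpy with the method-based rolling API; series is assumed
# to be a pandas Series.
import numpy as np

def avedev_modern(series, window):
    # mean absolute deviation of each rolling window
    return series.rolling(window).apply(lambda x: np.abs(x - x.mean()).mean(), raw=True)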
def decay_exp(self, x, f, n):
    return pd.rolling_apply(x, n, self.decay_exp_array, args=[f])
def ts_compoundFn(arr, min_periods, max_periods):
    if not (max_periods):
        max_periods = len(arr)
    return pd.rolling_apply(arr, max_periods, lambda arr: (1 + arr).prod() - 1,
                            min_periods=min_periods)
data = data.sort(["id", "trade_date"]).reset_index(drop=True)
data = data[data.trade_date.isin(np.arange(start_date, end_date, dtype='datetime64[D]'))]
data["trade_biweek"] = [x.year * 100 + int(datetime.datetime.strftime(x, "%U")) / 2
                        for x in data.trade_date]
data_grouped = data.groupby(["id", "trade_biweek"])
data['loss'] = data_grouped.accu_value.apply(
    lambda x: pd.expanding_apply(x, lambda y: (y[-1] / (np.max(y))) - 1))
data['biggest_loss'] = data.loc[data_grouped.loss.idxmin(), 'loss']
data['biggest_loss_day'] = data.loc[data_grouped.loss.idxmin(), 'trade_date']
data_result = pd.DataFrame()
data_result['biweek_first_date'] = data_grouped.trade_date.first()
data_result['biweek_last_date'] = data_grouped.trade_date.last()
data_result['biweek_start_value'] = data_grouped.accu_value.first()
data_result['biweek_last_value'] = data_grouped.accu_value.last()
data_result['earning1'] = (data_result.biweek_last_value / data_result.biweek_start_value) - 1
data_result['earning2'] = pd.concat([
    pd.rolling_apply(v.biweek_last_value, 2, lambda x: (x[1] / x[0]) - 1)
    for k, v in data_result.reset_index(level=0).groupby(["id"])]).values
data_result['earning'] = np.where(pd.isnull(data_result['earning2']),
                                  data_result['earning1'], data_result['earning2'])
# NOTE: computing per-id returns this way is a workaround and needs a proper fix
data['rtn'] = data.groupby(['id']).apply(
    lambda y: pd.rolling_apply(y['accu_value'], 2, lambda x: (x[1] / x[0]) - 1)).values
# data['rtn'] = data.rtn.fillna(0)
data_result['volatility'] = data_grouped.rtn.std()
data_result['win_days'] = data[data.rtn >= 0].groupby(["id", "trade_biweek"]).rtn.count()
data_result['lose_days'] = data[data.rtn < 0].groupby(["id", "trade_biweek"]).rtn.count()
data_result['biggest_loss_day'] = data.set_index(["id", "trade_biweek"]).biggest_loss_day.dropna()
data_result['biggest_loss'] = data.set_index(["id", "trade_biweek"]).biggest_loss.dropna()
def ts_geomeanFn(arr, min_periods, max_periods):
    if not (max_periods):
        max_periods = len(arr)
    return pd.rolling_apply(arr, max_periods,
                            lambda arr: (1 + arr).prod()**(1 / len(arr)) - 1,
                            min_periods=min_periods)
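# Hedged sketch (assumption, not from the original): method-based equivalents
# of ts_compoundFn and ts_geomeanFn above on current pandas; arr is assumed to
# be a pandas Series.
def ts_compound_modern(arr, min_periods, max_periods=None):
    max_periods = max_periods or len(arr)
    return arr.rolling(max_periods, min_periods=min_periods).apply(
        lambda w: (1 + w).prod() - 1, raw=True)

def ts_geomean_modern(arr, min_periods, max_periods=None):
    max_periods = max_periods or len(arr)
    return arr.rolling(max_periods, min_periods=min_periods).apply(
        lambda w: (1 + w).prod() ** (1 / len(w)) - 1, raw=True)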
def factor_return_rnn_predictor(df_factor_return_, start_date=datetime(2006, 5, 1),
                                look_back=10, rnn=None, type='gru',
                                num_internal_projection=4, dropout_probability=0.2,
                                init='he_uniform', loss='mse', optimizer='rmsprop',
                                nb_epoch=20, batch_size=10, save_to_csv=None,
                                train_freq="Monthly", train_period=None,
                                verbosity=False):
    if train_freq == "Monthly":
        offset_begin = MonthBegin()
        offset_end = MonthEnd()
    else:
        raise ValueError('Frequency not implemented yet')
    df_factor_return = deepcopy(df_factor_return_).sort_index()
    predict_start = start_date + DateOffset(days=0)
    predict_end = predict_start + offset_end
    last_day = df_factor_return.index[-1]
    predict_df_list = []
    if rnn is None:
        rnn = {}
        for factor in df_factor_return.columns:
            rnn[factor] = RNN(look_back, type=type,
                              num_internal_projection=num_internal_projection,
                              dropout_probability=dropout_probability,
                              init=init, loss=loss, optimizer=optimizer)
    while predict_start < last_day:
        print(predict_start)
        if train_period is None:
            df_train = df_factor_return[df_factor_return.index < predict_start]
        else:
            df_train = df_factor_return[
                (df_factor_return.index < predict_start) &
                (df_factor_return.index >= (predict_start - DateOffset(days=train_period)))]
        if verbosity:
            print("train from")
            print(df_train.index[0])
            print("to")
            print(df_train.index[-1])
        df_predict = pd.concat([
            df_train.ix[-(look_back - 1):],
            df_factor_return[(df_factor_return.index >= predict_start) &
                             (df_factor_return.index <= predict_end)]
        ]).sort_index()
        df_res = pd.DataFrame(index=df_predict.index)
        for factor in df_predict.columns:
            factor_series = df_train[[factor]].as_matrix()
            X_train, Y_train, X_predict = series_to_matricise(
                factor_series, look_back, True)
            X_train = X_train.reshape([X_train.shape[0], 1, X_train.shape[1]])
            rnn[factor].train(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size)
            rnn_func = lambda factor_series: rnn[factor].predict(
                factor_series.reshape([1, 1, look_back]))
            df_res[factor] = pd.rolling_apply(df_predict[factor], look_back, rnn_func)
        predict_df_list.append(df_res.dropna(axis=0))
        predict_start = predict_end + offset_begin
        predict_end = predict_start + offset_end
    return pd.concat(predict_df_list).sort_index()
data_grouped = data.groupby(["id", "trade_biweek"])
data['loss'] = data_grouped.accu_value.apply(
    lambda x: pd.expanding_apply(x, lambda y: (y[-1] / (np.max(y))) - 1))
data['biggest_loss'] = data.loc[data_grouped.loss.idxmin(), 'loss']
data['biggest_loss_day'] = data.loc[data_grouped.loss.idxmin(), 'trade_date']
data_result = pd.DataFrame()
data_result['biweek_start_value'] = data_grouped.accu_value.first()
data_result['biweek_last_value'] = data_grouped.accu_value.last()
data_result['earning1'] = (data_result.biweek_last_value / data_result.biweek_start_value) - 1
data_result['earning2'] = pd.concat([
    pd.rolling_apply(v.biweek_last_value, 2, lambda x: (x[1] / x[0]) - 1)
    for k, v in data_result.reset_index(level=0).groupby(["id"])
]).values
# data_result['gain2'] = pd.rolling_apply(data_result['last'], 2, lambda x: (x[1] / x[0]) - 1)
data_result['earning'] = np.where(pd.isnull(data_result['earning2']),
                                  data_result['earning1'], data_result['earning2'])
data['rtn'] = data.groupby(['id']).apply(
    lambda y: pd.rolling_apply(y['accu_value'], 2, lambda x: (x[1] / x[0]) - 1)).values
data_result['rtn_std'] = data_grouped.rtn.std()
import time

array_size = 100000
foo = pd.DataFrame(np.random.uniform(size=array_size))
window_size = 50
# floating point arithmetic would otherwise break the comparison with the scipy version
epsilon = 0.00000001

start = time.time()
a = roll_rank(foo[0].values, window_size, window_size, 0.8)
end = time.time()
print('cython ranking: {0} seconds'.format(end - start))

def percentile_of_score(array):
    return stats.percentileofscore(array, 0.8)

start = time.time()
# adjust from percentile to rank
adj_factor = window_size / 100
b = pd.rolling_apply(foo[0], window_size, percentile_of_score) * adj_factor
end = time.time()
print('percentileofscore ranking: {0} seconds'.format(end - start))

bar = pd.DataFrame(abs(a - b) < epsilon)
if np.count_nonzero(bar[0]) == array_size - window_size + 1:
    print('Correct number of equal values')
else:
    print('Incorrect number of equal values')
def product(self, x, n):
    return pd.rolling_apply(x, n, np.product)