def test_diff_axis(self): # GH 9727 df = DataFrame([[1., 2.], [3., 4.]]) assert_frame_equal(df.diff(axis=1), DataFrame( [[np.nan, 1.], [np.nan, 1.]])) assert_frame_equal(df.diff(axis=0), DataFrame( [[np.nan, np.nan], [2., 2.]]))
def test_diff_datetime_axis1(self, tz): # GH 18578 df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), 1: date_range('2010', freq='D', periods=2, tz=tz)}) if tz is None: result = df.diff(axis=1) expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']), 1: pd.TimedeltaIndex(['0 days', '0 days'])}) assert_frame_equal(result, expected) else: with pytest.raises(NotImplementedError): result = df.diff(axis=1)
def test_diff_timedelta(self): # GH 4533 df = DataFrame(dict(time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], value=[1.0, 2.0])) res = df.diff() exp = DataFrame([[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"]) assert_frame_equal(res, exp)
def test_diff_datetime_axis0(self, tz): # GH 18578 df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), 1: date_range('2010', freq='D', periods=2, tz=tz)}) result = df.diff(axis=0) expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']), 1: pd.TimedeltaIndex(['NaT', '1 days'])}) assert_frame_equal(result, expected)
def test_diff_timedelta(self): # GH 4533 df = DataFrame( dict(time=[Timestamp('20130101 9:01'), Timestamp('20130101 9:02')], value=[1.0, 2.0])) res = df.diff() exp = DataFrame([[pd.NaT, np.nan], [pd.Timedelta('00:01:00'), 1]], columns=['time', 'value']) assert_frame_equal(res, exp)
def rsi(x: pd.DataFrame, d: int = 14) -> pd.DataFrame:
    """Return the Relative Strength Index of ``x``.

    Args:
        x: Values to analyse; differences are taken row to row.
        d: Smoothing parameter, passed positionally to ``ewm`` (where it is
            the centre of mass).

    Returns:
        A frame shaped like ``x`` holding ``100 - 100 / (1 + RS)``, where RS
        is the smoothed gain divided by the absolute smoothed loss.
    """
    def _smooth(frame):
        # Recursive (adjust=False) exponentially weighted mean.
        return frame.ewm(d, adjust=False).mean()

    delta = x.diff()

    # Gains keep only positive moves; losses keep only non-positive moves.
    gains = delta.copy()
    gains[delta <= 0] = 0
    losses = delta.copy()
    losses[delta > 0] = 0

    relative_strength = _smooth(gains) / abs(_smooth(losses))
    return 100 - 100 / (1 + relative_strength)
def test_diff_timedelta(self): # GH 4533 df = DataFrame(dict(time=[Timestamp('20130101 9:01'), Timestamp('20130101 9:02')], value=[1.0, 2.0])) res = df.diff() exp = DataFrame([[pd.NaT, np.nan], [pd.Timedelta('00:01:00'), 1]], columns=['time', 'value']) assert_frame_equal(res, exp)
def test_diff_axis1_mixed_dtypes_negative_periods(self): # GH#32995 operate column-wise when we have mixed dtypes and axis=1 df = DataFrame({ "A": range(3), "B": 2 * np.arange(3, dtype=np.float64) }) expected = DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan}) result = df.diff(axis=1, periods=-1) tm.assert_frame_equal(result, expected)
def test_diff_axis1_mixed_dtypes_large_periods(self): # GH#32995 operate column-wise when we have mixed dtypes and axis=1 df = DataFrame({ "A": range(3), "B": 2 * np.arange(3, dtype=np.float64) }) expected = df * np.nan result = df.diff(axis=1, periods=3) tm.assert_frame_equal(result, expected)
def test_diff_datetime_axis1(self, tz): # GH 18578 df = DataFrame( { 0: date_range("2010", freq="D", periods=2, tz=tz), 1: date_range("2010", freq="D", periods=2, tz=tz), } ) if tz is None: result = df.diff(axis=1) expected = DataFrame( { 0: pd.TimedeltaIndex(["NaT", "NaT"]), 1: pd.TimedeltaIndex(["0 days", "0 days"]), } ) assert_frame_equal(result, expected) else: with pytest.raises(NotImplementedError): result = df.diff(axis=1)
def test_diff_axis1_mixed_dtypes(self): # GH#32995 operate column-wise when we have mixed dtypes and axis=1 df = DataFrame({ "A": range(3), "B": 2 * np.arange(3, dtype=np.float64) }) expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2}) result = df.diff(axis=1) tm.assert_frame_equal(result, expected) # GH#21437 mixed-float-dtypes df = DataFrame({ "a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64") }) result = df.diff(axis=1) expected = DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0}) tm.assert_frame_equal(result, expected)
def trending_up(cls, ohlc: DataFrame, col: str, period: int) -> pd.Series:
    """Return a boolean series flagging rises over the last ``period`` rows.

    :param ohlc: price data; the column named by ``col`` is analysed. A bare
        Series is also accepted and used as-is.
    :param col: name of the column of ``ohlc`` to test
    :param period: number of rows to look back when differencing
    :return: boolean Series named ``"trending_up <period>"``; True where the
        value is greater than it was ``period`` rows earlier
    """
    # Previously `col` was ignored and the whole frame was handed to
    # pd.Series(); select the requested column so the result is 1-D.
    values = ohlc[col] if isinstance(ohlc, pd.DataFrame) else ohlc
    return pd.Series(values.diff(period) > 0,
                     name="trending_up {}".format(period))
def test_diff_timedelta(self): # GH#4533 df = DataFrame({ "time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], "value": [1.0, 2.0], }) res = df.diff() exp = DataFrame([[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"]) tm.assert_frame_equal(res, exp)
def __get_an_attitude(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the row-to-row relative change ``(df[n] - df[n-1]) / df[n-1]``.

    The ratio is needed to calculate R[i,t], CR[i,t] and TR[i,t].

    :param df: DataFrame whose relative changes are wanted.
    :return: DataFrame of relative changes; the first row has no
        predecessor and is therefore NaN.
    """
    previous_rows = df.shift(1)
    absolute_change = df.diff()
    return absolute_change.div(previous_rows)
def obtain_jerk_signal(self, signal: pd.DataFrame) -> pd.DataFrame:
    """Differentiate ``signal`` with respect to time.

    Args:
        signal (pd.DataFrame): samples, one row per time step.

    Returns:
        pd.DataFrame: first differences divided by the sample period
        ``1 / self.fs``.
    """
    sample_period = 1 / self.fs

    derivative = signal.diff(periods=1)
    # The first difference is undefined; reuse the second row's value.
    derivative.iloc[0] = derivative.iloc[1]

    return derivative / sample_period
def transform(self, data: pd.DataFrame) -> np.ndarray:
    """Turn ``data`` into RNN training examples.

    The frame is differenced with lag ``self.diff_lag``, rows made NaN by
    the differencing are dropped, and the result is passed through
    ``series.dataframe_to_supervised`` (with ``self.sequence_length``) and
    ``series.supervised_to_rnn_examples``.
    """
    # TODO: scale data
    # scaler = MinMaxScaler(feature_range=(-1, 1))
    # scaler = scaler.fit(train_diff)
    # train_scaled = scaler.transform(train_diff)
    # train_unscaled = scaler.inverse_transform(train_scaled)
    differenced = data.diff(self.diff_lag)
    differenced = differenced.dropna()

    windows = series.dataframe_to_supervised(differenced,
                                             self.sequence_length)
    return series.supervised_to_rnn_examples(windows)  # type: np.ndarray
def processUScovidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
    """Convert raw national COVIDTracking data into daily counts.

    Args:
        data: Raw frame with a ``date`` column in ``%Y%m%d`` form and
            cumulative ``positive`` and ``negative`` columns. It is not
            modified.
        run_date: Date of the model run. Rows dated on or after it are cut
            away, since that day's data is not yet available at run time.

    Returns:
        Frame indexed by ``(region, date)`` with daily ``positive`` and
        ``total`` counts, where ``total = positive + negative`` and every
        row belongs to the single region ``'USA'``.
    """
    # Work on a copy: the column assignments below would otherwise be
    # written into the caller's DataFrame.
    data = data.copy()
    data["region"] = 'USA'
    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
    data['total'] = data['positive'] + data['negative']

    data = data.set_index(["region", "date"]).sort_index()
    data = data[["positive", "total"]]

    # Cumulative -> daily counts; negative corrections are clipped to zero.
    data = data.diff().dropna().clip(0, None).sort_index()

    last_available = run_date - pd.DateOffset(1)
    return data.loc[idx[:, :last_available], ["positive", "total"]]
def test_diff_datetime_axis0(self, tz): # GH 18578 df = DataFrame({ 0: date_range('2010', freq='D', periods=2, tz=tz), 1: date_range('2010', freq='D', periods=2, tz=tz) }) result = df.diff(axis=0) expected = DataFrame({ 0: pd.TimedeltaIndex(['NaT', '1 days']), 1: pd.TimedeltaIndex(['NaT', '1 days']) }) assert_frame_equal(result, expected)
def test_diff_datetime_axis0(self, tz): # GH 18578 df = DataFrame({ 0: date_range("2010", freq="D", periods=2, tz=tz), 1: date_range("2010", freq="D", periods=2, tz=tz), }) result = df.diff(axis=0) expected = DataFrame({ 0: pd.TimedeltaIndex(["NaT", "1 days"]), 1: pd.TimedeltaIndex(["NaT", "1 days"]), }) assert_frame_equal(result, expected)
def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
    """Process raw COVIDTracking data into the form used by GenerativeModel.

    Cumulative ``positive`` and ``total`` columns are turned into daily
    counts, and a series of hard-coded corrections is applied for known
    data errors and obvious outliers.

    Args:
        data: Raw frame with ``state`` and ``date`` (``%Y%m%d``) columns plus
            cumulative ``positive`` and ``total`` columns.
        run_date: Last date kept; later rows are sliced away up front.

    Returns:
        Frame indexed by ``(region, date)`` with daily ``positive`` and
        ``total`` counts.
    """
    data = data.rename(columns={"state": "region"})
    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
    data = data.set_index(["region", "date"]).sort_index()
    # Keep only rows up to and including run_date.
    data = data.loc[idx[:, :run_date], ["positive", "total"]]

    # Too little data or unreliable reporting in the data source.
    data = data.drop(["MP", "GU", "AS", "PR", "VI"])

    # On Jun 5 Covidtracking started counting probable cases too
    # which increases the amount by 5014.
    # https://covidtracking.com/screenshots/MI/MI-20200605-184320.png
    data.loc[idx["MI", pd.Timestamp("2020-06-05"):], "positive"] -= 5014

    # From CT: On June 19th, LDH removed 1666 duplicate and non resident cases
    # after implementing a new de-duplication process.
    # The adjustment is applied to every column, not only "positive".
    data.loc[idx["LA", pd.Timestamp("2020-06-19"):], :] += 1666

    # Now work with daily counts
    # NOTE(review): diff() runs over the whole frame, so the first row of each
    # region is differenced against the last row of the previous region before
    # being clipped at zero -- confirm this is intended.
    data = data.diff().dropna().clip(0, None)

    # Michigan missed 6/18 totals and lumped them into 6/19 so we've
    # divided the totals in two and equally distributed to both days.
    data.loc[idx["MI", pd.Timestamp("2020-06-18")], "total"] = 14871
    data.loc[idx["MI", pd.Timestamp("2020-06-19")], "total"] = 14871

    # Note that when we set total to zero, the model ignores that date. See
    # the likelihood function in GenerativeModel.build

    # Huge outlier in NJ causing sampling issues.
    data.loc[idx["NJ", pd.Timestamp("2020-05-11")], :] = 0

    # Huge outlier in CA causing sampling issues.
    data.loc[idx["CA", pd.Timestamp("2020-04-22")], :] = 0

    # A bunch of incorrect values for WA data so nulling them out.
    data.loc[idx["WA", pd.Timestamp("2020-06-05"):pd.Timestamp("2020-06-07")], :] = 0
    data.loc[idx["WA", pd.Timestamp("2020-06-20"):pd.Timestamp("2020-06-21")], :] = 0

    # Outlier dates in PA
    data.loc[idx["PA", [
        pd.Timestamp("2020-06-03"),
        pd.Timestamp("2020-04-21"),
        pd.Timestamp("2020-05-20"),
    ], ], :, ] = 0

    return data
def test_diff_timedelta64_with_nat(self): # GH#32441 arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]") arr[:, 0] = np.timedelta64("NaT", "ns") df = DataFrame(arr) result = df.diff(1, axis=0) expected = DataFrame({ 0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)] }) tm.assert_equal(result, expected) result = df.diff(0) expected = df - df assert expected[0].isna().all() tm.assert_equal(result, expected) result = df.diff(-1, axis=1) expected = df * np.nan tm.assert_equal(result, expected)
def rsi(values, period):
    """Compute Wilder's RSI.

    * values: adjusted closing prices.
    * period: smoothing period; the exponential weight is ``1 / period``.
    * return: DataFrame of RSI values expressed as a ratio between 0 and 1
      (smoothed gains over smoothed gains plus smoothed losses), not as a
      percentage.
    """
    _values = DataFrame(values)
    # Difference from the previous day
    _diff = _values.diff(1)
    # clip_lower/clip_upper were removed in pandas 1.0; clip() with the
    # matching keyword is the supported equivalent.
    # Upward moves only
    _posi = _diff.clip(lower=0).ewm(alpha=1 / period).mean()
    # Downward moves only (values are <= 0)
    _nega = _diff.clip(upper=0).ewm(alpha=1 / period).mean()
    return _posi / (_posi - _nega)
def test_diff_integer_na(self, axis, expected): # GH#24171 IntegerNA Support for DataFrame.diff() df = DataFrame( { "a": np.repeat([0, 1, np.nan, 2], 2), "b": np.tile([0, 1, np.nan, 2], 2), "c": np.repeat(np.nan, 8), "d": np.arange(1, 9) ** 2, }, dtype="Int64", ) # Test case for default behaviour of diff result = df.diff(axis=axis) tm.assert_frame_equal(result, expected)
def makePortfolio(self, start=None, end=None, capital=None):
    """Build the portfolio value table and store it on ``self.portfolio``.

    Args:
        start: Index label used to look up the default capital. Defaults to
            the first label of ``self.signal``.
        end: Defaults to the last label of ``self.signal``; not otherwise
            used by this method.
        capital: Starting cash. Defaults to the ``'Price'`` at ``start``.

    The resulting frame shares the index of ``self.signal`` and has three
    columns: ``positions`` (signal times price), ``cash`` (capital minus
    the running cost of position changes) and ``total`` (their sum).
    """
    # Compare against None explicitly: legitimate falsy arguments such as
    # capital=0 or an index label of 0 must not be replaced by defaults.
    if start is None:
        start = self.signal.index[0]
    if end is None:
        end = self.signal.index[-1]
    if capital is None:
        capital = self.signal.loc[start, 'Price']

    price = self.signal['Price']
    held = self.signal['Signal']

    portfolio = DataFrame(index=self.signal.index)
    portfolio['positions'] = held * price
    # Each change in position is bought or sold at that row's price.
    portfolio['cash'] = capital - (held.diff() * price).cumsum()
    portfolio['total'] = portfolio['positions'] + portfolio['cash']
    self.portfolio = portfolio
def data_preprocessing(table: pd.DataFrame,
                       country_set: Set[str],
                       test_split: float = 0.1):
    """Difference each country's row and split the result into train/test.

    Rows of ``table`` whose index label is not in ``country_set`` are
    dropped in place. The remaining rows are differenced along the columns
    and split, in order and without shuffling, into training and test sets.

    Returns:
        ``(X_train, X_test, all_data, index)`` where ``all_data`` is a numpy
        array with one row of differences per country and ``index`` is the
        index of the differenced table.
    """
    unwanted = set(table.index).difference(country_set)
    table.drop(unwanted, inplace=True)

    # The first column has no left neighbour and is removed by dropna.
    increments = table.diff(axis=1).dropna(axis=1)
    all_data = np.array(increments)

    train_part, test_part = train_test_split(
        all_data,
        test_size=test_split,
        random_state=RANDOM_SEED,
        shuffle=False,
    )
    return train_part, test_part, all_data, increments.index
def create_parent_draws(parent_draws: pd.DataFrame) -> pd.DataFrame:
    """Aggregate child-location draws into draws for their single parent.

    ``parent_draws`` must be indexed by ``['location_id', 'date']`` and
    carry a ``parent_id`` column holding one unique value; that column is
    removed from the passed frame. Values are summed across locations for
    each date, and only dates present for every child location are kept.
    A date where any child value was null becomes NaN after the cumulative
    sum.

    Raises:
        ValueError: if the index names are not ``['location_id', 'date']``.
    """
    n_child_locations = parent_draws.reset_index()['location_id'].unique().size
    parent_id = parent_draws['parent_id'].unique().item()
    parent_draws.drop(columns='parent_id', inplace=True)

    if parent_draws.index.names != ['location_id', 'date']:
        raise ValueError(
            "Multi-index differs from expected (['location_id', 'date']).")

    # Dates reported by every child location (judged on the first column).
    per_date_counts = parent_draws.groupby(level=1).count().iloc[:, 0]
    complete_dates = per_date_counts[
        per_date_counts == n_child_locations].index

    had_null = parent_draws.isnull().groupby(level=1).sum() > 0

    # Sum over locations, then accumulate so that a null can be marked on
    # the cumulative series.
    cumulative = parent_draws.groupby(level=1).sum().cumsum()
    cumulative[had_null] = np.nan
    cumulative = cumulative.loc[complete_dates]

    # Back to per-date values; the first row keeps its cumulative value.
    per_date = cumulative.diff().fillna(cumulative)
    per_date['location_id'] = parent_id

    result = per_date.reset_index()
    result = result.set_index(['location_id', 'date'])
    return result.sort_index()
def dataFrameMathTest():
    """Print the result of a series of DataFrame math methods on 'mlg.csv'.

    Columns 1 through 6 of the CSV are used. Methods that return a Series
    work on columns by default.
    """
    # Load a DataFrame from a CSV file
    org_df = pd.read_csv('mlg.csv')
    df = org_df.iloc[:, 1:7]

    # (method name, positional arguments), evaluated and printed in order.
    # add(...) and dot(...) are intentionally left out.
    steps = (
        ('abs', ()),       # absolute values
        ('count', ()),     # non NA/null values
        ('cummax', ()),    # (cols default axis)
        ('cummin', ()),    # (cols default axis)
        ('cumsum', ()),    # (cols default axis)
        ('diff', ()),      # 1st diff (col def axis)
        ('div', (12,)),    # div by df, Series, value
        ('max', ()),       # max of axis (col def)
        ('mean', ()),      # mean (col default axis)
        ('median', ()),    # median (col default)
        ('min', ()),       # min of axis (col def)
        ('mul', (2,)),     # mul by df Series val
        ('sum', ()),       # sum axis (cols default)
    )
    for method_name, args in steps:
        print(getattr(df, method_name)(*args))

    # Keep values above 0.5 and blank out the rest.
    print(df.where(df > 0.5, other=np.nan))
def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
    """Process raw per-province case data into the GenerativeModel form.

    Args:
        data: Raw frame with columns ``provincia`` (region), ``fecha``
            (date, ``%d/%m/%Y``), ``casos_acum`` (cumulative positives) and
            ``procesadas`` (cumulative tests).
        run_date: Date of the model run; rows dated on or after it are cut.

    Returns:
        Frame indexed by ``(region, date)`` with daily ``positive`` and
        ``total`` counts. Rows where positives are not below the total, and
        one hard-coded date for "La Romana", are zeroed.
    """
    data = data.rename(columns={
        "provincia": "region",
        "casos_acum": "positive",
        "fecha": "date",
        "procesadas": "total",
    })
    data["date"] = pd.to_datetime(data["date"], format="%d/%m/%Y")
    data = data.set_index(["region", "date"]).sort_index()
    data = data[["positive", "total"]]

    # Now work with daily counts.
    # Difference within each region: a frame-wide diff() would subtract the
    # previous region's last cumulative value from the next region's first
    # row and report the result as a daily count.
    daily = data.groupby(level="region").diff()
    data = daily.dropna().clip(0, None).sort_index()

    zero_filter = (data.positive >= data.total)
    data.loc[zero_filter, :] = 0
    data.loc[idx["La Romana", pd.Timestamp("2020-12-02")], :] = 0

    # At the real time of `run_date`, the data for `run_date` is not yet available!
    # Cutting it away is important for backtesting!
    return data.loc[idx[:, :(run_date - pd.DateOffset(1))], ["positive", "total"]]
def create_energy_dataframe(acm_dataframe: pd.DataFrame,
                            aggregation_count_threshold: int,
                            max_successive_time_diff: str,
                            aggregation_time: str) -> pd.DataFrame:
    """
    Build an energy feature from raw accelerometer data.

    :param acm_dataframe: raw accelerometer dataframe with a "time" column
    :param aggregation_count_threshold: a time bin is kept only when it holds
     more samples than this threshold
    :param max_successive_time_diff: samples are used only when the gap to the
     previous timestamp is below this value
    :param aggregation_time: resampling period over which energy is summed
    :return energy_dataframe: dataframe with one column,
     "energy_by_<aggregation_time>", indexed by time
    """
    gaps = acm_dataframe["time"].diff(periods=1)
    is_contiguous = gaps < max_successive_time_diff

    # Sample-to-sample differences of the non-time columns, contiguous rows only.
    deltas = acm_dataframe.diff(periods=1)[is_contiguous]
    deltas = deltas.drop(["time"], axis=1)

    # Euclidean norm of each difference row.
    norms = (deltas**2).apply(sum, axis=1).apply(np.sqrt).to_frame()
    norms.index = acm_dataframe[is_contiguous]["time"]
    norms.index.name = "timestamp"

    enough_samples = norms.resample(
        aggregation_time, label="right").count() > aggregation_count_threshold
    summed = norms.resample(aggregation_time, label="right").sum()

    column_name = "energy_by_{}".format(aggregation_time)
    return summed[enough_samples].dropna().rename(columns={0: column_name})
def create_energy_dataframe(acm_dataframe: pd.DataFrame,
                            aggregation_count_threshold: int,
                            max_successive_time_diff: str,
                            aggregation_time: str) -> pd.DataFrame:
    """
    Compute summed movement energy per time bin from accelerometer samples.

    :param acm_dataframe: accelerometer samples with a "time" column
    :param aggregation_count_threshold: minimum sample count (exclusive) a bin
     needs in order to be reported
    :param max_successive_time_diff: upper bound (exclusive) on the gap between
     successive timestamps for a sample to be used
    :param aggregation_time: resampling period for the bins
    :return: dataframe with the single column "energy_by_<aggregation_time>"
    """
    def _bins(frame):
        # Bins are labelled by their right edge.
        return frame.resample(aggregation_time, label="right")

    keep = acm_dataframe["time"].diff(periods=1) < max_successive_time_diff

    step = acm_dataframe.diff(periods=1)[keep].drop(["time"], axis=1)
    squared = step**2
    row_totals = squared.apply(sum, axis=1)
    magnitude = row_totals.apply(np.sqrt).to_frame()

    timestamps = acm_dataframe[keep]["time"]
    magnitude.index = timestamps
    magnitude.index.name = "timestamp"

    well_populated = _bins(magnitude).count() > aggregation_count_threshold
    energy = _bins(magnitude).sum()
    energy = energy[well_populated].dropna()
    energy = energy.rename(
        columns={0: "energy_by_{}".format(aggregation_time)})
    return energy
def precip_hyetograph_nrcs(df: pd.DataFrame) -> pd.DataFrame:
    """This function takes the dataframe precipitation table extracted from
    NOAA Atlas 14 and calculates the nested hyetograph for storm events
    classified by recurrence intervals. The function first retrieves the
    ratio of rainfall and incremental intensity; then proceeds to get the
    ratio, slope, and slope difference; and finally fits a parabolic curve
    from 0 to 9 hours that passes through the ratios at 0, 6, and 9 hours.
    The function then fits curves for the remaining data until 12 hours.
    NOTE: this function is limited to 24 hours and needs to be updated to
    be flexible for different storm durations.

    The input ``df`` is read through a ``'value'`` column and duration
    labels such as ``'05m'``, ``'02h'``, ``'06h'`` and ``'24h'``. It is
    modified in place: ``'ratio'`` and ``'increm_intensity'`` columns are
    added to it.

    Returns a frame with ``'ratio'``, ``'slope'`` and ``'t_step'`` columns
    whose index runs from 0 to 24 in steps of 0.1.
    """
    # Output table: 241 integer steps (0..240), rescaled by 0.1 at the end.
    ratio_to_24h = pd.DataFrame(np.arange(start=0, stop=241, step=1), columns = ['time']).set_index(['time'])
    # Increment between successive rows; the first row keeps its own value.
    dif = df.diff()
    dif.at['05m','value'] = df.at['05m','value']
    df['ratio'] = df/df.at['24h','value']
    # Multiplier applied to each row's increment to obtain an intensity.
    i_val = {'05m': 12, '10m': 12, '15m': 12, '30m': 4, '60m': 2, '02h': 1, '03h': 1, '06h': 1./3., '12h': 1./6., '24h': 1./12.}
    intensity_val = pd.DataFrame.from_dict(i_val, orient='index')
    df.insert(1, 'increm_intensity', dif['value']*intensity_val[0], True)
    # Time points of the raw ratio table.
    raw_rf = {'time':[0, 6, 9, 10.5, 11, 11.5, 11.75, 11.875, 11.917, 12, 12.083, 12.125, 12.25, 12.5, 13, 13.5, 15, 18, 24]}
    raw_df = pd.DataFrame(raw_rf, columns = ['time'])
    # Ratios before the 12 h point (temp_0) and their complement after it
    # (temp_24); the 12 h point itself is fixed at 0.5.
    temp_0 = 0.5 - df.sort_values('ratio', ascending=False)['ratio']*0.5
    temp_12 = 0.5
    temp_24 = 1 - temp_0.sort_values(0, ascending=False)
    raw_df.loc[0:9, 'ratio']= temp_0.values
    raw_df.loc[9:18, 'ratio'] = temp_24.values
    raw_df.loc[9, 'ratio'] = temp_12
    raw_df['slope_raw'] = raw_df['ratio'].diff()/raw_df['time'].diff()
    raw_df.loc[0, 'slope_raw'] = 0
    raw_df['slope_dif'] = raw_df.loc[0:9]['slope_raw'].diff()
    df2 = raw_df.set_index(['time'])
    # Coefficients of the first curve a*t**2 + b*t (applied to steps 0..90).
    a = ((2.0/3.0)*df2.at[9.0, 'ratio']-df2.at[6.0, 'ratio'])/18.0
    b = (df2.at[6.0,'ratio']-36.0*a)/6.0
    # NOTE(review): low_12h and up_12h are computed but never read below.
    low_12h = 4.0*df.loc['24h','value']*(1.0/36.0+2.0/9.0*df.loc['06h','value']/df.loc['24h','value'])
    up_12h = 2.0/3.0*df.loc['24h','value']*(5.0/6.0+2.0/3.0*df.loc['06h','value']/df.loc['24h','value'])
    # Fallback coefficients when the fitted ones take unwanted signs.
    if b < 0.0:
        a=df2.at[9.0,'ratio']/81.0
        b=0.0
    if 18.0*a+b<0:
        a=(-1.0*b/18.0)
        b=df2.at[9.0,'ratio']/4.5
    # Coefficients of the second curve (applied to steps 91..105).
    a2 = (9.0/10.5*df2.at[10.5,'ratio']-df2.at[9.0,'ratio'])/13.5
    b2 = (df2.at[9.0,'ratio']-81.0*a2)/9.0
    # Bounds used to constrain the 2-hour value before the third curve.
    up_2 = 2.0*df.loc['24h','value']*(0.5-(df2.at[11.5, 'ratio']+3.0*df2.at[10.5, 'ratio'])/4.0)+0.01
    low_2 = 2.0*df.loc['24h','value']*(0.5-(3.0*df2.at[11.5, 'ratio']+df2.at[10.5, 'ratio'])/4.0)+0.01
    if df.loc['02h', 'value']<low_2:
        test1 = low_2
    else:
        test1 = df.loc['02h', 'value']
    if df.loc['02h', 'value']> up_2:
        test2 = up_2
    else:
        test2 = df.loc['02h','value']
    if test1 > test2:
        test3 = test1
    else:
        test3 = test2
    if test2 > test3:
        test4 = test2
    else:
        test4 = test3
    if test4>up_2:
        test_f = up_2
    else:
        test_f = test4
    # Coefficients of the third curve a3*t**2 + b3*t + c3 (steps 106..115).
    a3 = 2.0*(df2.at[11.5, 'ratio']-2*(0.5-0.5*test_f/df.loc['24h', 'value'])+ df2.at[10.5, 'ratio'])
    b3 = df2.at[11.5, 'ratio']-df2.at[10.5, 'ratio']-22.0*a3
    c3 = (0.5-0.5*test_f/df.loc['24h','value'])-121.0*a3-11.0*b3
    # Evaluate the three curves; the index is divided by 10 to get the time.
    ratio_to_24h.loc[0:90, 'ratio'] = a*np.power(ratio_to_24h.loc[0:90].index/10.0, 2)+\
        b*ratio_to_24h.loc[0:90].index/10.0
    ratio_to_24h.loc[91:105, 'ratio'] = a2*np.power(ratio_to_24h.loc[91:105].index/10.0, 2)+\
        b2*ratio_to_24h.loc[91:105].index/10.0
    ratio_to_24h.loc[106:115, 'ratio'] = a3*np.power(ratio_to_24h.loc[106:115].index/10.0, 2)+\
        b3*ratio_to_24h.loc[106:115].index/10.0 + c3
    ratio_to_24h['slope'] = ratio_to_24h['ratio'].diff()/0.1
    # Interpolation factors for steps 116 and 117, capped at 0.399 and 0.799.
    if -0.867*ratio_to_24h.loc[115, 'slope']+0.4337 < 0.399:
        fac_116 = -0.867*ratio_to_24h.loc[115, 'slope']+0.4337
    else:
        fac_116 = 0.399
    if -0.4917*ratio_to_24h.loc[115,'slope']+0.8182 < 0.799:
        fac_117 = -0.4917*ratio_to_24h.loc[115,'slope']+0.8182
    else:
        fac_117 = 0.799
    # Steps 116..119 are interpolated between neighbouring raw ratios.
    ratio_to_24h.at[116, 'ratio'] = df2.at[11.5, 'ratio']+fac_116*(df2.at[11.75,'ratio']-df2.at[11.5, 'ratio'])
    ratio_to_24h.at[117, 'ratio'] = df2.at[11.5, 'ratio']+fac_117*(df2.at[11.75,'ratio']-df2.at[11.5, 'ratio'])
    ratio_to_24h.at[118, 'ratio'] = df2.at[11.75, 'ratio']+0.4*(df2.at[11.875,'ratio']-df2.at[11.75, 'ratio'])
    ratio_to_24h.at[119, 'ratio'] = df2.at[11.875, 'ratio']+0.6*(df2.at[11.917,'ratio']-df2.at[11.875, 'ratio'])
    # Second half (steps 121..240) is one minus the first half in reverse order.
    ratio_to_24h.loc[121:240, 'ratio'] = 1-ratio_to_24h.loc[0:119, 'ratio'].sort_index(ascending=False).values
    ratio_to_24h.loc[120, 'ratio'] = ratio_to_24h.at[121, 'ratio']-(df.at['05m', 'ratio']+1.0/5.0* (df.at['10m','ratio']-df.at['05m','ratio']))
    ratio_to_24h.loc[0, 'ratio'] = 0
    # Recompute the slope now that every step has a ratio.
    ratio_to_24h['slope'] = ratio_to_24h['ratio'].diff()/0.1
    ratio_to_24h.at[0, 'slope'] = 0
    # Rescale the integer steps by 0.1, both as a column and as the index.
    ratio_to_24h['t_step'] = ratio_to_24h.index*0.1
    ratio_to_24h.index = ratio_to_24h.index*0.1
    return ratio_to_24h
def test_diff_mixed_dtype(self): df = DataFrame(np.random.randn(5, 3)) df['A'] = np.array([1, 2, 3, 4, 5], dtype=object) result = df.diff() self.assertEqual(result[0].dtype, np.float64)
class Portfolio:
    """
    Class for storing portfolio information

    Parameters
    ----------
    tickers : sequence of string
        asset tickers
    vol_params : dict with volatility params (accepted but not used by
        ``__init__``)

    Attributes
    ----------
    open_, close_, high_, low_, volume_ : pandas dataframe
        Daily data, one column per ticker (set by ``load_data``)
    open_to_open_, close_to_close_, open_to_close_, overnight_ : pandas dataframes
        Price changes (set by ``compute_returns``)
    vol_ : pandas dataframe
        daily vol (set by ``compute_vol``)
    cov_, corr_ : pandas dataframes
        covariance and correlation (set by ``compute_cov``)
    positions_ : pandas dataframe
        columns = tickers (set by ``load_transactions``)
    """

    def __init__(self, tickers, vol_params=VOL_PARAMS):
        self.tickers = array(tickers)

    def load_data(self, db_file=None):
        """
        Loads online data by default, or loads from database if db_file is specified.

        A ticker that fails to load is reported and skipped.
        """
        # Local names avoid shadowing the builtin open().
        opens, closes, highs, lows, volumes = {}, {}, {}, {}, {}
        tickers_loaded = []
        for ticker in self.tickers:
            try:
                if db_file:
                    data = getDataFromDB(ticker, db_file)['daily_data']
                else:
                    data = getYahooData(ticker).rename(columns={
                        'Adj Close': 'Close',
                        'Close': 'Unadj Close'
                    })
                columns = (data['Open'], data['Close'], data['High'],
                           data['Low'], data['Volume'])
            except Exception as e:
                # Best effort: one failing ticker must not abort the rest.
                print('Could not load data for {0}: {1}'.format(ticker, e))
                continue
            (opens[ticker], closes[ticker], highs[ticker], lows[ticker],
             volumes[ticker]) = columns
            # Recorded only once every column was extracted successfully.
            tickers_loaded.append(ticker)
        print('Loaded data for:\n {}'.format(tickers_loaded))
        self.open_ = DataFrame(opens)
        self.close_ = DataFrame(closes)
        self.high_ = DataFrame(highs)
        self.low_ = DataFrame(lows)
        self.volume_ = DataFrame(volumes)
        self.dates_ = Series(self.open_.index)

    def compute_returns(self):
        """
        Computes open_to_open, close_to_close, open_to_close and overnight returns.

        All four are price differences, not ratios.
        """
        self.open_to_open_ = self.open_.diff()
        self.close_to_close_ = self.close_.diff()
        self.open_to_close_ = self.close_ - self.open_
        self.overnight_ = self.open_ - self.close_.shift(1)

    def compute_vol(self,
                    method='exponential',
                    alpha=0.05,
                    window=20,
                    returns='open'):
        """
        Computes vol of assets in portfolio.

        method is 'exponential' (uses alpha) or 'rolling' (uses window);
        returns selects 'open' or 'close' returns. Raises ValueError for any
        other value.
        """
        if (method not in ['exponential', 'rolling'
                           ]) or (returns not in ['open', 'close']):
            raise ValueError('Wrong inputs for computing vol!')
        self.vol_params_ = {'method': method, 'returns': returns}
        returns = self.open_to_open_ if returns == 'open' else self.close_to_close_
        if method == 'exponential':
            self.vol_ = returns.ewm(alpha=alpha).std()
            self.vol_params_['alpha'] = alpha
        elif method == 'rolling':
            self.vol_ = returns.rolling(window=window).std()
            self.vol_params_['window'] = window

    def compute_cov(self,
                    method='rolling',
                    alpha=0.025,
                    window=50,
                    returns='open'):
        """
        Computes covariance and correlation of assets in portfolio.

        method is 'exponential' (uses alpha) or 'rolling' (uses window);
        returns selects 'open' or 'close' returns. Raises ValueError for any
        other value.
        """
        # alpha used to default to the string '0.025', which ewm() cannot
        # use as a smoothing factor; it is a float now.
        if (method not in ['exponential', 'rolling'
                           ]) or (returns not in ['open', 'close']):
            raise ValueError('Wrong inputs for computing cov!')
        self.cov_params_ = {'method': method, 'returns': returns}
        returns = self.open_to_open_ if returns == 'open' else self.close_to_close_
        if method == 'exponential':
            self.cov_ = returns.ewm(alpha=alpha).cov()
            self.corr_ = returns.ewm(alpha=alpha).corr()
            self.cov_params_['alpha'] = alpha
        elif method == 'rolling':
            self.cov_ = returns.rolling(window=window).cov()
            self.corr_ = returns.rolling(window=window).corr()
            self.cov_params_['window'] = window

    def load_earnings_data(self, earnings_dir=EARNINGS_DIR):
        """
        Loads earnings data from one '<ticker>.xls' file per ticker.

        A ticker whose file cannot be read is reported and skipped.
        """
        self.eps_, self.revenue_ = dict(), dict()
        tickers_loaded = []
        for ticker in self.tickers:
            try:
                earnings_file = fullfile(earnings_dir, ticker + '.xls')
                df = read_excel(earnings_file)
                # split dataframe at the 'Revenue' row into EPS and revenue
                self.eps_[ticker] = df.loc[:'Revenue'].loc[
                    'Wall St.':'Actual'].T
                self.revenue_[ticker] = df.loc['Revenue':].loc[
                    'Wall St.':'Actual'].T
                tickers_loaded.append(ticker)
            except Exception as e:
                print('Could not load earnings data for {0}: {1}'.format(
                    ticker, e))
        print('Loaded earnings data for:\n {}'.format(tickers_loaded))

    def load_transactions(self,
                          transactions,
                          augment_tickers=False,
                          date_format='%Y%m%d'):
        """
        Loads trade, dividend and cash info from a transactions dataframe.

        With augment_tickers the portfolio's tickers are extended by those
        found in the trades; otherwise the unknown tickers are only reported.
        """
        trades, dividends, cash = format_transactions(transactions,
                                                      date_format=date_format)

        # Handle ticker info
        trades_tickers = unique(trades['ticker'])
        # Computed before any augmentation; afterwards the difference would
        # always be empty.
        unknown_tickers = setdiff1d(trades_tickers, self.tickers)
        if augment_tickers:
            self.tickers = union1d(self.tickers, trades_tickers)
            print('The portfolio was augmented with teh following tickers:')
            print(unknown_tickers)
        else:
            print(
                'The trades for the following assets were not included in the portfolio:'
            )
            print(unknown_tickers)

        # Compute positions from trade info: daily share totals per ticker,
        # shifted one day and accumulated.
        df = trades.groupby('ticker').apply(lambda x: x.resample('D').sum())
        self.positions_ = df['shares'].unstack(
            level='ticker').shift(1).fillna(0).cumsum().applymap(int)

        # Load cash flow and cash dataframes
        self.cash_flow_ = cash.resample('D').sum().fillna(0)
        self.cash_ = self.cash_flow_.cumsum()

        # Load dividend dataframe
        self.dividends_ = dividends.resample('D').sum()
def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
    """Process raw COVIDTracking data into the form used by GenerativeModel.

    Cumulative ``positive`` and ``total`` columns are turned into daily
    counts, and a series of hard-coded corrections is applied for known
    data errors and obvious outliers.

    Args:
        data: Raw frame with ``state`` and ``date`` (``%Y%m%d``) columns plus
            cumulative ``positive`` and ``total`` columns.
        run_date: Date of the model run; rows dated on or after it are cut
            from the result.

    Returns:
        Frame indexed by ``(region, date)`` with daily ``positive`` and
        ``total`` counts.
    """
    data = data.rename(columns={"state": "region"})
    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
    data = data.set_index(["region", "date"]).sort_index()
    data = data[["positive", "total"]]

    # Too little data or unreliable reporting in the data source.
    data = data.drop(["MP", "GU", "AS", "PR", "VI"])

    # On Jun 5 Covidtracking started counting probable cases too
    # which increases the amount by 5014.
    # https://covidtracking.com/screenshots/MI/MI-20200605-184320.png
    data.loc[idx["MI", pd.Timestamp("2020-06-05") :], "positive"] -= 5014

    # From CT: On June 19th, LDH removed 1666 duplicate and non resident cases
    # after implementing a new de-duplication process.
    # The adjustment is applied to every column, not only "positive".
    data.loc[idx["LA", pd.Timestamp("2020-06-19") :], :] += 1666

    # Now work with daily counts
    # NOTE(review): diff() runs over the whole frame, so the first row of each
    # region is differenced against the last row of the previous region before
    # being clipped at zero -- confirm this is intended.
    data = data.diff().dropna().clip(0, None).sort_index()

    # Michigan missed 6/18 totals and lumped them into 6/19 so we've
    # divided the totals in two and equally distributed to both days.
    data.loc[idx["MI", pd.Timestamp("2020-06-18")], "total"] = 14871
    data.loc[idx["MI", pd.Timestamp("2020-06-19")], "total"] = 14871

    # Note that when we set total to zero, the model ignores that date. See
    # the likelihood function in GenerativeModel.build

    # Huge outlier in NJ causing sampling issues.
    data.loc[idx["NJ", pd.Timestamp("2020-05-11")], :] = 0
    # Same tests and positives, nulling out
    data.loc[idx["NJ", pd.Timestamp("2020-07-25")], :] = 0

    # Huge outlier in CA causing sampling issues.
    data.loc[idx["CA", pd.Timestamp("2020-04-22")], :] = 0

    # Outlier in SC, zeroed like the ones above.
    # TODO: generally should handle when # tests == # positives and that
    # is not an indication of positive rate.
    data.loc[idx["SC", pd.Timestamp("2020-06-26")], :] = 0

    # Two days of no new data then lumped sum on third day with lack of new total tests
    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'positive'] = 174
    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'total'] = 3296

    # https://twitter.com/OHdeptofhealth/status/1278768987292209154
    data.loc[idx["OH", pd.Timestamp("2020-07-01")], :] = 0
    data.loc[idx["OH", pd.Timestamp("2020-07-09")], :] = 0

    # Nevada didn't report total tests this day
    data.loc[idx["NV", pd.Timestamp("2020-07-02")], :] = 0

    # A bunch of incorrect values for WA data so nulling them out.
    data.loc[idx["WA", pd.Timestamp("2020-06-05") : pd.Timestamp("2020-06-07")], :] = 0
    data.loc[idx["WA", pd.Timestamp("2020-06-20") : pd.Timestamp("2020-06-21")], :] = 0

    # AL reported tests == positives
    data.loc[idx["AL", pd.Timestamp("2020-07-09")], :] = 0

    # Low reported tests
    data.loc[idx["AR", pd.Timestamp("2020-07-10")], :] = 0

    # Positives == tests
    data.loc[idx["MS", pd.Timestamp("2020-07-12")], :] = 0

    # Positive == Tests; lumpy reporting for CT
    data.loc[idx["CT", pd.Timestamp("2020-07-17")], :] = 0
    data.loc[idx["CT", pd.Timestamp("2020-07-21")], :] = 0

    data.loc[idx["DC", pd.Timestamp("2020-08-04")], :] = 0

    # Outlier dates in PA
    data.loc[
        idx[
            "PA",
            [
                pd.Timestamp("2020-06-03"),
                pd.Timestamp("2020-04-21"),
                pd.Timestamp("2020-05-20"),
            ],
        ],
        :,
    ] = 0

    # Further individual dates zeroed out; no reason is recorded for these.
    data.loc[idx["HI", pd.Timestamp("2020-08-07")], :] = 0
    data.loc[idx["TX", pd.Timestamp("2020-08-08")], :] = 0
    data.loc[idx["TX", pd.Timestamp("2020-08-11")], :] = 0

    data.loc[idx["DE", pd.Timestamp("2020-08-14")], :] = 0

    data.loc[idx["SD", pd.Timestamp("2020-08-26")], :] = 0

    data.loc[idx["WA", pd.Timestamp("2020-09-22"):pd.Timestamp("2020-09-24")], :] = 0

    # Zero out any rows where positive tests equal or exceed total reported tests
    # Do not act on Wyoming as they report positive==total most days
    # The filter only applies from filtering_date onwards.
    filtering_date = pd.Timestamp('2020-07-27')
    zero_filter = (data.positive >= data.total) & \
        (data.index.get_level_values('date') >= filtering_date) & \
        (~data.index.get_level_values('region').isin(['WY']))
    data.loc[zero_filter, :] = 0

    # At the real time of `run_date`, the data for `run_date` is not yet available!
    # Cutting it away is important for backtesting!
    return data.loc[idx[:, :(run_date - pd.DateOffset(1))], ["positive", "total"]]
# Plot buy and sell signals
# up arrow when we buy one share
plt.plot(buys.index[-100:], MLDataFrame.loc[buys.index]['close'][-100:],
         '^', markersize=10, color='red', lw=2., label='Buy')
# down arrow when we sell one share
plt.plot(sells.index[-100:], MLDataFrame.loc[sells.index]['close'][-100:],
         'v', markersize=10, color='green', lw=2., label='Sell')
plt.ylabel('Price (USD)')
plt.xlabel('Date')
plt.title('Last 100 Buy and Sell signals')
plt.legend(loc='best')
plt.show()

# Simulate a portfolio that follows the position column.
initial_capital = 10000.0
positions = DataFrame(index=MLDataFrame.index).fillna(0.0)
portfolio = DataFrame(index=MLDataFrame.index).fillna(0.0)
positions['bitcoin'] = MLDataFrame['positions']
# Value of the held position at each row's closing price.
portfolio['positions'] = (positions.multiply(MLDataFrame['close'], axis=0))
# Cash is the starting capital minus the running cost of position changes.
portfolio['cash'] = initial_capital - (positions.diff().multiply(MLDataFrame['close'], axis=0)).cumsum()
portfolio['total'] = portfolio['positions'] + portfolio['cash']
plt.plot(portfolio)
plt.legend()
plt.show()
...
prices = MLDataFrame.copy()
# `axis` can no longer be passed positionally to drop() (removed in
# pandas 2.0); name the columns explicitly.
prices.drop(columns=['signal'], inplace=True)
OHLCV = ['open', 'high', 'low', 'close', 'volume']