def _prepare_data_from_formula(
    formula: str, data: DataFrame, portfolios: DataFrame
) -> Tuple[DataFrame, DataFrame, str]:
    """Split a patsy formula into factor and portfolio design matrices.

    If ``portfolios`` is supplied, the entire formula names the factors.
    Otherwise the formula is split on ``~``: the left-hand side gives the
    test portfolios and the right-hand side the factors.  Missing values
    always raise rather than being dropped.
    """
    na_action = NAAction(on_NA="raise", NA_types=[])
    orig_formula = formula
    if portfolios is not None:
        # "+ 0" suppresses the intercept patsy would otherwise insert.
        factors = dmatrix(
            formula + " + 0", data, return_type="dataframe", NA_action=na_action
        )
    else:
        parts = formula.split("~")
        portfolios = dmatrix(
            parts[0].strip() + " + 0",
            data,
            return_type="dataframe",
            NA_action=na_action,
        )
        factors = dmatrix(
            parts[1].strip() + " + 0",
            data,
            return_type="dataframe",
            NA_action=na_action,
        )
    return factors, portfolios, orig_formula
def from_formula(cls, formula, data, *, portfolios=None):
    """
    Parameters
    ----------
    formula : str
        Patsy formula modified for the syntax described in the notes
    data : DataFrame
        DataFrame containing the variables used in the formula
    portfolios : array-like, optional
        Portfolios to be used in the model

    Returns
    -------
    model : TradedFactorModel
        Model instance

    Notes
    -----
    The formula can be used in one of two ways. The first specifies only
    the factors and uses the data provided in ``portfolios`` as the test
    portfolios. The second specifies the portfolios using ``+`` to
    separate the test portfolios and ``~`` to separate the test
    portfolios from the factors.

    Examples
    --------
    >>> from linearmodels.datasets import french
    >>> from linearmodels.asset_pricing import TradedFactorModel
    >>> data = french.load()
    >>> formula = 'S1M1 + S1M5 + S3M3 + S5M1 + S5M5 ~ MktRF + SMB + HML'
    >>> mod = TradedFactorModel.from_formula(formula, data)

    Using only factors

    >>> portfolios = data[['S1M1', 'S1M5', 'S3M1', 'S3M5', 'S5M1', 'S5M5']]
    >>> formula = 'MktRF + SMB + HML'
    >>> mod = TradedFactorModel.from_formula(formula, data, portfolios=portfolios)
    """
    # Raise immediately on missing values rather than silently dropping rows.
    na_action = NAAction(on_NA='raise', NA_types=[])
    orig_formula = formula
    if portfolios is not None:
        # Formula contains only factors; '+ 0' drops the implicit intercept.
        factors = dmatrix(formula + ' + 0', data, return_type='dataframe',
                          NA_action=na_action)
    else:
        # Left of '~' lists the test portfolios, right lists the factors.
        # FIX: use a separate name instead of rebinding ``formula`` to a list.
        parts = formula.split('~')
        portfolios = dmatrix(parts[0].strip() + ' + 0', data,
                             return_type='dataframe', NA_action=na_action)
        factors = dmatrix(parts[1].strip() + ' + 0', data,
                          return_type='dataframe', NA_action=na_action)
    mod = cls(portfolios, factors)
    mod.formula = orig_formula
    return mod
def _prepare_data_from_formula(formula, data, portfolios):
    """Split a patsy formula into factor and portfolio design matrices.

    If ``portfolios`` is provided, the whole formula describes the factors;
    otherwise the formula is split on ``~`` with portfolios on the left and
    factors on the right.  Missing values raise immediately.
    """
    na_action = NAAction(on_NA='raise', NA_types=[])
    orig_formula = formula
    if portfolios is not None:
        # '+ 0' suppresses the intercept column.
        factors = dmatrix(formula + ' + 0', data, return_type='dataframe',
                          NA_action=na_action)
    else:
        # FIX: do not rebind ``formula`` to a list — the shadowing changed
        # the name's meaning mid-function; matches the typed variant of
        # this helper elsewhere in the codebase.
        formula_components = formula.split('~')
        portfolios = dmatrix(formula_components[0].strip() + ' + 0', data,
                             return_type='dataframe', NA_action=na_action)
        factors = dmatrix(formula_components[1].strip() + ' + 0', data,
                          return_type='dataframe', NA_action=na_action)
    return factors, portfolios, orig_formula
def instruments(self) -> OptionalDataFrame:
    """Instruments"""
    # '0 +' strips the intercept term patsy would otherwise insert.
    matrix = dmatrix('0 + ' + self.components['instruments'],
                     self._data,
                     eval_env=self._eval_env,
                     return_type='dataframe',
                     NA_action=self._na_action)
    return self._empty_check(matrix)
def backward_difference_coding(X_in, cols=None):
    """Backward-difference contrast encode the requested columns.

    Parameters
    ----------
    X_in : DataFrame
        Input data; it is copied and never modified in place.
    cols : list, optional
        Columns to encode. When omitted, every column is encoded and no
        columns are passed through unchanged.

    Returns
    -------
    DataFrame
        Encoded columns followed by any pass-through columns, NaNs -> 0.0.
    """
    X = X_in.copy(deep=True)
    # Prefix names so patsy formulas cannot collide with numeric labels.
    X.columns = ['col_' + str(x) for x in X.columns.values]
    if cols is None:
        # BUG FIX: the 'col_' prefix mapping previously ran *before* this
        # check and raised TypeError whenever ``cols`` was left as None.
        cols = X.columns.values
        pass_thru = []
    else:
        cols = ['col_' + str(x) for x in cols]
        pass_thru = [col for col in X.columns.values if col not in cols]
    bin_cols = []
    for col in cols:
        mod = dmatrix("C(%s, Diff)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[str(col) + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(str(col) + '_%d' % (dig, ))
    X = X.reindex(columns=bin_cols + pass_thru)
    # BUG FIX: fillna is not in-place by default; assign the result back.
    X = X.fillna(0.0)
    return X
def polynomial_coding(X_in, cols=None):
    """Polynomial (orthogonal) contrast encode the requested columns.

    Parameters
    ----------
    X_in : DataFrame
        Input data; it is copied and never modified in place.
    cols : list, optional
        Columns to encode. When omitted, every column is encoded and no
        columns are passed through unchanged.

    Returns
    -------
    DataFrame
        Encoded columns followed by any pass-through columns.
    """
    X = X_in.copy(deep=True)
    # Prefix names so patsy formulas cannot collide with numeric labels.
    X.columns = ['col_' + str(x) for x in X.columns.values]
    if cols is None:
        # BUG FIX: the 'col_' prefix mapping previously ran *before* this
        # check and raised TypeError whenever ``cols`` was left as None.
        cols = X.columns.values
        pass_thru = []
    else:
        cols = ['col_' + str(x) for x in cols]
        pass_thru = [col for col in X.columns.values if col not in cols]
    # Sentinel-fill missing values so patsy treats them as a level.
    X.fillna(-1, inplace=True)
    bin_cols = []
    for col in cols:
        mod = dmatrix("C(%s, Poly)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[str(col) + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(str(col) + '_%d' % (dig, ))
    X = X.reindex(columns=bin_cols + pass_thru)
    return X
def test_crs_with_specific_constraint():
    from patsy.highlevel import incr_dbuilder, build_design_matrices, dmatrix

    x = (-1.5) ** np.arange(20)
    # Hard coded R values for smooth: s(x, bs="cr", k=5)
    # R> knots <- smooth$xp
    knots_R = np.array([
        -2216.837820053100585937,
        -50.456909179687500000,
        -0.250000000000000000,
        33.637939453125000000,
        1477.891880035400390625,
    ])
    # R> centering.constraint <- t(qr.X(attr(smooth, "qrc")))
    centering_constraint_R = np.array([[
        0.064910676323168478574,
        1.4519875239407085132,
        -2.1947446912471946234,
        1.6129783104357671153,
        0.064868180547550072235,
    ]])
    # Points at which a prediction is required.
    new_x = np.array([-3000., -200., 300., 2000.])
    result1 = dmatrix(
        "cr(new_x, knots=knots_R[1:-1], "
        "lower_bound=knots_R[0], upper_bound=knots_R[-1], "
        "constraints=centering_constraint_R)"
    )
    data_chunked = [{"x": x[:10]}, {"x": x[10:]}]
    new_data = {"x": new_x}
    builder = incr_dbuilder(
        "cr(x, df=4, constraints='center')", lambda: iter(data_chunked)
    )
    result2 = build_design_matrices([builder], new_data)[0]
    assert np.allclose(result1, result2, rtol=1e-12, atol=0.)
def test_crs_with_specific_constraint():
    from patsy.highlevel import incr_dbuilder, build_design_matrices, dmatrix

    x = (-1.5) ** np.arange(20)

    # Hard coded R values for smooth: s(x, bs="cr", k=5)
    # R> knots <- smooth$xp
    knots_R = np.array(
        [-2216.837820053100585937, -50.456909179687500000,
         -0.250000000000000000, 33.637939453125000000,
         1477.891880035400390625])
    # R> centering.constraint <- t(qr.X(attr(smooth, "qrc")))
    centering_constraint_R = np.array(
        [[0.064910676323168478574, 1.4519875239407085132,
          -2.1947446912471946234, 1.6129783104357671153,
          0.064868180547550072235]])

    # Points at which predictions are required.
    new_x = np.array([-3000., -200., 300., 2000.])
    result1 = dmatrix("cr(new_x, knots=knots_R[1:-1], "
                      "lower_bound=knots_R[0], upper_bound=knots_R[-1], "
                      "constraints=centering_constraint_R)")

    data_chunked = [{"x": x[:10]}, {"x": x[10:]}]
    new_data = {"x": new_x}
    builder = incr_dbuilder("cr(x, df=4, constraints='center')",
                            lambda: iter(data_chunked))
    result2 = build_design_matrices([builder], new_data)[0]

    assert np.allclose(result1, result2, rtol=1e-12, atol=0.)
def polynomial_coding(X_in, cols=None):
    """Polynomial (orthogonal) contrast encode the requested columns.

    Column references in the patsy formula are wrapped in ``Q("...")`` so
    names with unusual characters still resolve.

    Parameters
    ----------
    X_in : DataFrame
        Input data; it is copied and never modified in place.
    cols : list, optional
        Columns to encode. When omitted, every column is encoded and no
        columns are passed through unchanged.

    Returns
    -------
    DataFrame
        Encoded columns followed by any pass-through columns.
    """
    X = X_in.copy(deep=True)
    # Prefix names so patsy formulas cannot collide with numeric labels.
    X.columns = ['col_' + str(x) for x in X.columns.values]
    if cols is None:
        # BUG FIX: the 'col_' prefix mapping previously ran *before* this
        # check and raised TypeError whenever ``cols`` was left as None.
        cols = X.columns.values
        pass_thru = []
    else:
        cols = ['col_' + str(x) for x in cols]
        pass_thru = [col for col in X.columns.values if col not in cols]
    # Sentinel-fill missing values so patsy treats them as a level.
    X.fillna(-1, inplace=True)
    bin_cols = []
    for col in cols:
        mod = dmatrix("C(Q(\"%s\"), Poly)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[str(col) + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(str(col) + '_%d' % (dig, ))
    X = X.reindex(columns=bin_cols + pass_thru)
    return X
def backward_difference_coding(X_in, cols=None):
    """Backward-difference contrast encode the requested columns.

    :param X_in: input DataFrame; deep-copied, never modified in place.
    :param cols: columns to encode; all columns when None.
    :return: DataFrame of encoded columns plus pass-through columns,
        with NaNs replaced by 0.0.
    """
    X = copy.deepcopy(X_in)
    if cols is None:
        cols = X.columns.values
        pass_thru = []
    else:
        pass_thru = [col for col in X.columns.values if col not in cols]
    bin_cols = []
    for col in cols:
        mod = dmatrix("C(%s, Diff)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[col + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(col + '_%d' % (dig, ))
    X = X.reindex(columns=bin_cols + pass_thru)
    # BUG FIX: fillna returns a new frame unless inplace=True; the result
    # was previously discarded, leaving NaNs in the output.
    X = X.fillna(0.0)
    return X
def sum_coding(X_in, cols=None):
    """
    :param X:
    :return:
    """
    X = X_in.copy(deep=True)
    if cols is None:
        cols = X.columns.values
        pass_thru = []
    else:
        pass_thru = [c for c in X.columns.values if c not in cols]
    encoded = []
    for column in cols:
        # Sum (deviation) contrast coding via patsy.
        design = dmatrix("C(%s, Sum)" % (column, ), X)
        for level in range(len(design[0])):
            name = column + '_%d' % (level, )
            X[name] = design[:, level]
            encoded.append(name)
    return X.reindex(columns=encoded + pass_thru)
def get_X(self, states: List[MdpState], actions: List[Action], refit_scaler: bool) -> np.ndarray:
    """
    Extract features for state-action pairs.

    :param states: States.
    :param actions: Actions.
    :param refit_scaler: Whether or not to refit the feature scaler before scaling the extracted features.
    :return: State-feature numpy.ndarray.
    """

    extracted = self.feature_extractor.extract(states, actions, refit_scaler)

    # Without a formula, the extraction result must already be a numpy
    # array and is used directly.
    if self.formula is None:
        if isinstance(extracted, np.ndarray):
            return extracted
        raise ValueError(  # pragma no cover
            'Expected feature extractor to return a numpy.ndarray if not a pandas.DataFrame'
        )

    # A formula can only be applied to a pandas DataFrame.
    if isinstance(extracted, pd.DataFrame):
        return dmatrix(self.formula, extracted)

    # Any other combination is invalid.
    raise ValueError(
        f'Invalid combination of formula {self.formula} and feature extractor result {type(extracted)}'
    )
def transform(self, data):
    """Transform ``data`` with the stored patsy formula.

    The fitted template rows are prepended before building the design
    matrix so categorical levels seen during fitting are always present;
    only the rows belonging to ``data`` are returned.

    :param data: DataFrame of new rows to transform.
    :return: design-matrix DataFrame for the rows of ``data`` only.
    """
    df_full = self.template_data
    df_new = data.copy()
    df_patsy = pd.concat([df_full, df_new])
    df_transformed = dmatrix(formula_like=self.formula,
                             data=df_patsy,
                             return_type='dataframe',
                             NA_action='raise')
    # BUG FIX: slicing with ``[-len(df_new):]`` returned the *entire*
    # frame when ``data`` was empty (``[-0:]`` is ``[:]``); slice past
    # the template rows positionally instead.
    df_return_data = df_transformed.iloc[len(df_full):]
    return df_return_data
def endog(self) -> OptionalDataFrame:
    """Endogenous variables"""
    # '0 +' strips the intercept term patsy would otherwise insert.
    design = dmatrix('0 + ' + self.components['endog'],
                     self._data,
                     eval_env=self._eval_env,
                     return_type='dataframe',
                     NA_action=self._na_action)
    return self._empty_check(design)
def endog(self):
    """Endogenous variables"""
    # Build an intercept-free design matrix for the endogenous terms.
    formula = '0 + ' + self.components['endog']
    return dmatrix(formula, self._data, eval_env=self._eval_env,
                   return_type='dataframe', NA_action=self._na_action)
def instruments(self):
    """Instruments"""
    # Build an intercept-free design matrix for the instrument terms.
    formula = '0 + ' + self.components['instruments']
    return dmatrix(formula, self._data, eval_env=self._eval_env,
                   return_type='dataframe', NA_action=self._na_action)
def dependent(self):
    """Dependent variable"""
    # Build an intercept-free design matrix for the dependent term.
    formula = '0 + ' + self.components['dependent']
    return dmatrix(formula, self._data, eval_env=self._eval_env,
                   return_type='dataframe', NA_action=self._na_action)
def predict(self, input_data: pd.DataFrame, issue_times: pd.DatetimeIndex) -> pd.DataFrame:
    """Predict with every fitted model and assemble a prediction frame.

    Duplicate rows are collapsed before building the design matrix and
    predictions are expanded back afterwards; negative predictions are
    clipped to zero.
    """
    resampled_data, unique_inverse = self.unique_data(input_data, issue_times)
    design = dmatrix(self.exog, resampled_data)
    predictions = [np.maximum(model.predict(design), 0)[unique_inverse]
                   for model in self.models]
    return PredictionDataFrameBuilder(self, issue_times).build(
        np.array(predictions).T,
    )
def endog(self) -> OptionalDataFrame:
    """Endogenous variables"""
    # "0 +" suppresses patsy's implicit intercept column.
    formula = "0 + " + self.components["endog"]
    matrix = dmatrix(
        formula,
        self._data,
        eval_env=self._eval_env,
        return_type="dataframe",
        NA_action=self._na_action,
    )
    return self._empty_check(matrix)
def transform(self, data):
    '''
    First time use reduced rank transformer. Second plus times, use full
    rank transformer. The dataframe union that contains this transformer
    will automagically merge down to the same reduced rank

    :param data:
    '''
    # NA_action='raise' surfaces missing values instead of dropping rows.
    design = dmatrix(formula_like=self.formula,
                     data=data,
                     return_type='dataframe',
                     NA_action='raise')
    return design
def dependent(self) -> DataFrame:
    """Dependent variable"""
    # "0 +" suppresses patsy's implicit intercept column.
    formula = "0 + " + self.components["dependent"]
    return dmatrix(
        formula,
        self._data,
        eval_env=self._eval_env,
        return_type="dataframe",
        NA_action=self._na_action,
    )
def __get_model_fit(
    self, serie: Optional[int] = None
) -> sm.RegressionResultsWrapper:
    """Fit a weighted least squares model on calibration data.

    When ``serie`` is given, only that serie's calibration subset is
    used; otherwise the full calibration data is fitted.
    """
    if serie is None:
        calibration_data: pd.DataFrame = self.data.calibration_data
    else:
        calibration_data = self.data.get_serie(serie, "calibration")
    # Observation weights come from evaluating the weight formula
    # against the calibration data.
    weights = dmatrix(self.weight, calibration_data)
    return smf.wls(
        formula=self.formula,
        weights=weights,
        data=calibration_data,
    ).fit()
def instruments(self) -> OptionalDataFrame:
    """Instruments"""
    # "0 +" suppresses patsy's implicit intercept column.
    formula = "0 + " + self.components["instruments"]
    design = dmatrix(
        formula,
        self._data,
        eval_env=self._eval_env,
        return_type="dataframe",
        NA_action=self._na_action,
    )
    return self._empty_check(design)
def estimate_trend(self, time_series_x: np.ndarray, time_series_y: np.ndarray):
    """Estimate a smooth trend via a B-spline basis and a GLM fit.

    Knot locations are taken from the configured quantiles of ``time_series_x``
    (Durrleman and Simon (1989) recommends (0.05, 0.50, 0.95) for natural
    splines); a generalised linear model fitted on the spline basis is then
    evaluated at the same points to produce the trend.
    """
    # Cubic spline generation (4 knots)
    knots = tuple(np.quantile(time_series_x, self.quantile))
    # Build the spline design matrix for the configured degree and knots.
    spline_basis = dmatrix(
        f"bs(time_series, knots = {knots}, degree = {self.degree}, include_intercept=False)",
        {"time_series": time_series_x},
        return_type='dataframe')
    # Fitting Generalised linear model on transformed dataset, then
    # prediction on splines.
    fitted = sm.GLM(time_series_y, spline_basis).fit()
    return fitted.predict(spline_basis).to_numpy()
def transform(self, data):
    '''
    First time use reduced rank transformer. Second plus times, use full
    rank transformer. The dataframe union that contains this transformer
    will automagically merge down to the same reduced rank

    :param data: data to encode with the stored patsy formula; the first
        design-matrix column is remembered as the reference column and
        dropped from this and all subsequent transforms.
    '''
    return_data = dmatrix(formula_like=self.formula,
                          data=data,
                          return_type='dataframe',
                          NA_action='raise')
    if self.reference_column is None:
        # Remember the dropped level so later calls stay consistent.
        self.reference_column = return_data.columns[0]
    try:
        return_data.drop(self.reference_column, axis=1, inplace=True)
    except (KeyError, ValueError):
        # BUG FIX: pandas DataFrame.drop raises KeyError (not ValueError)
        # for a missing column, so the original except clause never fired.
        # ValueError is kept for backward compatibility.
        pass
    return return_data
def sum_coding(X_in):
    """
    :param X:
    :return:
    """
    X = copy.deepcopy(X_in)
    encoded_cols = []
    for column in X.columns.values:
        # Sum (deviation) contrast coding via patsy.
        design = dmatrix("C(%s, Sum)" % (column, ), X)
        for level in range(len(design[0])):
            new_name = column + '_%d' % (level, )
            X[new_name] = design[:, level]
            encoded_cols.append(new_name)
    return X.reindex(columns=encoded_cols)
def helmert_coding(X_in):
    """
    :param X:
    :return:
    """
    X = copy.deepcopy(X_in)
    encoded_cols = []
    for column in X.columns.values:
        # Helmert contrast coding via patsy.
        design = dmatrix("C(%s, Helmert)" % (column, ), X)
        for level in range(len(design[0])):
            new_name = column + '_%d' % (level, )
            X[new_name] = design[:, level]
            encoded_cols.append(new_name)
    return X.reindex(columns=encoded_cols)
def backward_difference_coding(X_in):
    """Backward-difference contrast encode every column of ``X_in``.

    :param X_in: input DataFrame; deep-copied, never modified in place.
    :return: DataFrame containing only the encoded columns, with NaNs
        replaced by 0.0.
    """
    X = copy.deepcopy(X_in)
    bin_cols = []
    for col in X.columns.values:
        mod = dmatrix("C(%s, Diff)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[col + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(col + '_%d' % (dig, ))
    X = X.reindex(columns=bin_cols)
    # BUG FIX: fillna returns a new frame unless inplace=True; the result
    # was previously discarded, leaving NaNs in the output.
    X = X.fillna(0.0)
    return X
def polynomial_coding(X_in, cols=None):
    """
    :param X:
    :return:
    """
    X = copy.deepcopy(X_in)
    if cols is None:
        cols = X.columns.values
    encoded = []
    for column in cols:
        # Orthogonal polynomial contrast coding via patsy.
        design = dmatrix("C(%s, Poly)" % (column, ), X)
        for level in range(len(design[0])):
            name = column + '_%d' % (level, )
            X[name] = design[:, level]
            encoded.append(name)
    return X.reindex(columns=encoded)
def dmatrix_lambda(x_parameter):
    """Build a cubic B-spline design matrix for ``x_parameter``.

    ``str_knots`` is captured from the enclosing scope.
    """
    formula = ('bs(x, knots=({str_knots}), degree=3, '
               'include_intercept=False)').format(str_knots=str_knots)
    return dmatrix(formula, {'x': x_parameter}, return_type='dataframe')
X_train_yes, family=sm.families.Poisson()).fit() y_train_no, X_train_no = dmatrices(no_expr, df_las, return_type='dataframe') poisson_training_results_no = sm.GLM(y_train_no, X_train_no, family=sm.families.Poisson()).fit() # Evaluate the regression print(poisson_training_results_yes.summary()) print(poisson_training_results_no.summary()) # Then use the model to predict results for Intersections X_test_yes = dmatrix(expr, df_intersection, return_type='dataframe') poisson_predictions_yes = poisson_training_results_yes.predict(X_test_yes) X_test_no = dmatrix(expr, df_intersection, return_type='dataframe') poisson_predictions_no = poisson_training_results_no.predict(X_test_no) # And read those results into the intersection dataframe df_intersection['predicted_yes'] = poisson_predictions_yes df_intersection['predicted_no'] = poisson_predictions_no # Create two new columns in the intersection dataframe, showing the code for la and constituency intersection_index = index_table.drop_duplicates( subset=['Intersection']).set_index('Intersection') df_intersection = df_intersection.join( intersection_index.loc[:, ('CouncilArea2011Code',
import pandas as pd
from patsy.highlevel import dmatrix

"""
https://towardsdatascience.com/the-dummys-guide-to-creating-dummy-variables-f21faddb1d40
https://www.youtube.com/watch?v=WRxHfnl-Pcs
"""

url = 'http://data.princeton.edu/wws509/datasets/salary.dat'
# FIX: ``delim_whitespace=True`` is deprecated (removed in pandas 3.0);
# ``sep=r'\s+'`` is the supported equivalent.
df = pd.read_table(url, sep=r'\s+')
print(df.head())

# use pandas
dummy = pd.get_dummies(df['sx'])
print(dummy.head())
df = pd.concat([df, dummy], axis=1)
print(df.head())

# use patsy
dummy = dmatrix("sx", df, return_type='dataframe')
df = pd.concat([df, dummy], axis=1)
print(df.head())
def transform(self, data):
    """One-hot encode a Series via patsy and drop the intercept column."""
    # Stringify values so patsy treats every level as categorical.
    frame = pd.DataFrame(data.apply(str))
    design = dmatrix(formula_like=str(data.name),
                     data=frame,
                     return_type='dataframe',
                     NA_action='raise')
    return design.drop('Intercept', axis=1)