class TestDataFrameEvalWithFrame(object): def setup_method(self, method): self.frame = DataFrame(randn(10, 3), columns=list('abc')) def teardown_method(self, method): del self.frame def test_simple_expr(self, parser, engine): res = self.frame.eval('a + b', engine=engine, parser=parser) expect = self.frame.a + self.frame.b assert_series_equal(res, expect) def test_bool_arith_expr(self, parser, engine): res = self.frame.eval('a[a < 1] + b', engine=engine, parser=parser) expect = self.frame.a[self.frame.a < 1] + self.frame.b assert_series_equal(res, expect) def test_invalid_type_for_operator_raises(self, parser, engine): df = DataFrame({'a': [1, 2], 'b': ['c', 'd']}) ops = '+', '-', '*', '/' for op in ops: with tm.assert_raises_regex(TypeError, r"unsupported operand type\(s\) " "for .+: '.+' and '.+'"): df.eval('a {0} b'.format(op), engine=engine, parser=parser)
class TestDataFrameEvalNumExprPandas(tm.TestCase):
    """Legacy unittest-style eval tests pinned to numexpr engine / pandas parser."""

    @classmethod
    def setUpClass(cls):
        super(TestDataFrameEvalNumExprPandas, cls).setUpClass()
        cls.engine = 'numexpr'
        cls.parser = 'pandas'
        # Skip the whole class when numexpr is unavailable.
        tm.skip_if_no_ne()

    def setUp(self):
        self.frame = DataFrame(randn(10, 3), columns=list('abc'))

    def tearDown(self):
        del self.frame

    def test_simple_expr(self):
        expected = self.frame.a + self.frame.b
        result = self.frame.eval('a + b', engine=self.engine,
                                 parser=self.parser)
        assert_series_equal(result, expected)

    def test_bool_arith_expr(self):
        expected = self.frame.a[self.frame.a < 1] + self.frame.b
        result = self.frame.eval('a[a < 1] + b', engine=self.engine,
                                 parser=self.parser)
        assert_series_equal(result, expected)

    def test_invalid_type_for_operator_raises(self):
        # Arithmetic between an int column and a str column must raise.
        df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
        for op in ('+', '-', '*', '/'):
            with tm.assertRaisesRegexp(TypeError, msg):
                df.eval('a {0} b'.format(op), engine=self.engine,
                        parser=self.parser)
class TestDataFrameEvalNumExprPandas(object): @classmethod def setup_class(cls): cls.engine = 'numexpr' cls.parser = 'pandas' tm.skip_if_no_ne() def setup_method(self, method): self.frame = DataFrame(randn(10, 3), columns=list('abc')) def teardown_method(self, method): del self.frame def test_simple_expr(self): res = self.frame.eval('a + b', engine=self.engine, parser=self.parser) expect = self.frame.a + self.frame.b assert_series_equal(res, expect) def test_bool_arith_expr(self): res = self.frame.eval('a[a < 1] + b', engine=self.engine, parser=self.parser) expect = self.frame.a[self.frame.a < 1] + self.frame.b assert_series_equal(res, expect) def test_invalid_type_for_operator_raises(self): df = DataFrame({'a': [1, 2], 'b': ['c', 'd']}) ops = '+', '-', '*', '/' for op in ops: with tm.assert_raises_regex( TypeError, "unsupported operand type\(s\) " "for .+: '.+' and '.+'"): df.eval('a {0} b'.format(op), engine=self.engine, parser=self.parser)
class TestDataFrameEvalWithFrame(object):
    """Eval smoke tests parameterized over the parser/engine fixtures."""

    def setup_method(self, method):
        self.frame = DataFrame(np.random.randn(10, 3), columns=list('abc'))

    def teardown_method(self, method):
        del self.frame

    def test_simple_expr(self, parser, engine):
        expected = self.frame.a + self.frame.b
        result = self.frame.eval('a + b', engine=engine, parser=parser)
        assert_series_equal(result, expected)

    def test_bool_arith_expr(self, parser, engine):
        expected = self.frame.a[self.frame.a < 1] + self.frame.b
        result = self.frame.eval('a[a < 1] + b', engine=engine, parser=parser)
        assert_series_equal(result, expected)

    @pytest.mark.parametrize('op', ['+', '-', '*', '/'])
    def test_invalid_type_for_operator_raises(self, parser, engine, op):
        # int column <op> str column must raise a TypeError.
        frame = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
        with pytest.raises(TypeError, match=msg):
            frame.eval('a {0} b'.format(op), engine=engine, parser=parser)
def test_invalid_type_for_operator_raises(self, parser, engine):
    """Arithmetic between an int column and a str column must raise TypeError."""
    df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
    msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
    for op in ('+', '-', '*', '/'):
        with tm.assert_raises_regex(TypeError, msg):
            df.eval('a {0} b'.format(op), engine=engine, parser=parser)
def test_invalid_type_for_operator_raises(self, parser, engine):
    """Mixing an int column and a str column in an eval binop raises TypeError."""
    frame = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
    pattern = (r"unsupported operand type\(s\) "
               r"for .+: '.+' and '.+'")
    for operator in '+', '-', '*', '/':
        with tm.assert_raises_regex(TypeError, pattern):
            frame.eval('a {0} b'.format(operator), engine=engine,
                       parser=parser)
def test_invalid_type_for_operator_raises(self):
    """Arithmetic between an int column and a str column must raise TypeError.

    Uses the engine/parser configured on the test class instance.
    """
    df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
    # FIX: the regex literal was not a raw string, so '\(' was an invalid
    # escape sequence (SyntaxWarning on modern Python); use a raw string.
    msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
    for op in ('+', '-', '*', '/'):
        with tm.assertRaisesRegexp(TypeError, msg):
            df.eval('a {0} b'.format(op), engine=self.engine,
                    parser=self.parser)
def eval_formula(df: DataFrame, formula: str) -> DataFrame:
    """Evaluate ``formula`` against ``df``, preferring the default engine.

    Falls back to the slower ``python`` engine for expressions NumExpr
    cannot handle, and maps +/- infinity (e.g. produced by division by
    zero) to NaN so the result stays JSON-serializable.
    """
    try:
        evaluated = df.eval(formula)
    except Exception:  # for all cases not handled by NumExpr
        evaluated = df.eval(formula, engine='python')
    try:
        # eval can introduce Infinity values (when dividing by 0),
        # which do not have a JSON representation.
        # Let's replace them by NaN:
        return evaluated.replace([np.inf, -np.inf], np.nan)
    except Exception:
        # `evaluated` is not a Series (scalar etc.) — return it unchanged.
        return evaluated
def aggregate_match_sims(simdf: DataFrame, agg_func: str):
    """Aggregate similarities using a numexpr aggregation function.

    Extra functions available: ``@max(*a)``, ``@min(*a)``, ``@mean(*a)``,
    ``@pow(a,b)``.

    See also:
        `Pandas eval <https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#supported-syntax>`_
        `Numexpr <https://numexpr.readthedocs.io/>`_

    Args:
        simdf: DataFrame of similarities, where columns are matcher names.
        agg_func: Numexpr-style function.

    Returns:
        A Series named ``0``, indexed like ``simdf``, with the aggregated
        similarity per row.
    """
    # FIX: `tqdm` was imported on this line but never used (and it is a
    # third-party dependency); only `warnings` is actually needed.
    import warnings

    with warnings.catch_warnings():
        # NaN-aware reductions warn on all-NaN slices; silence them.
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        funcs = {
            "max": lambda *args: np.nanmax(args, axis=0),
            "min": lambda *args: np.nanmin(args, axis=0),
            "mean": lambda *args: np.nanmean(args, axis=0),
            "pow": lambda a, b: a**b,
        }
        if agg_func in funcs:
            # Bare function name: apply it across all matcher columns.
            agg = funcs[agg_func](*(simdf[c] for c in simdf))  # type: ignore
        else:
            # Arbitrary expression: evaluate with the helpers in scope.
            agg = simdf.eval(agg_func, local_dict=funcs, engine="python")
    return pd.Series(agg, index=simdf.index, name=0)
def apply_to_dataframe(
    self,
    df: pd.DataFrame,
    column_name: str = "unnamed_response",
    do_query: bool = False,
) -> None:
    """Apply the trained fold models to ``df``, storing the mean response.

    Adds ``column_name`` to the dataframe if it doesn't already exist
    (initialized to -9999.0) and fills it with the average of the three
    fold models' positive-class probabilities. Does nothing when the
    dataframe is empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to read and augment.
    column_name : str
        Name to give the BDT response variable.
    do_query : bool
        Perform a query on the dataframe to select events belonging to
        the region associated with training result; necessary if the
        dataframe hasn't been pre-filtered.

    Examples
    --------
    >>> from tdub.apply import FoldedTrainSummary
    >>> from tdub.frames import raw_dataframe
    >>> df = raw_dataframe("/path/to/file.root")
    >>> fr_1j1b = FoldedTrainSummary("/path/to/folded_training_1j1b")
    >>> fr_1j1b.apply_to_dataframe(df, do_query=True)
    """
    if df.shape[0] == 0:
        log.info("Dataframe is empty, doing nothing")
        return None

    if column_name not in df.columns:
        log.info(f"Creating {column_name} column")
        df[column_name] = -9999.0

    selection_mask = None
    if do_query:
        log.info(f"applying selection filter '{self.selection_used}'")
        selection_mask = df.eval(self.selection_used)
        feature_matrix = df[self.features].to_numpy()[selection_mask]
    else:
        feature_matrix = df[self.features].to_numpy()

    if feature_matrix.shape[0] == 0:
        return None

    # Average the positive-class probability over the three folds.
    fold_responses = [
        fold_model.predict_proba(feature_matrix)[:, 1]
        for fold_model in (self.model0, self.model1, self.model2)
    ]
    response = np.mean(fold_responses, axis=0)

    if do_query:
        df.loc[selection_mask, column_name] = response
    else:
        df[column_name] = response
def filter_records(df: pd.DataFrame, criteria: str) -> pd.DataFrame:
    """Return the rows of ``df`` satisfying ``criteria``.

    On any evaluation failure the error is logged and the original frame
    is returned unchanged.
    """
    try:
        mask = df.eval(criteria)
        return df[mask]
    except Exception as e:
        logger.log_error(e)
        return df
def expr(data: pd.DataFrame, step: dict):
    """Apply one configured transformation ``step`` to ``data`` in place.

    ``step`` is a mapping with keys ``operation``, ``expression`` and
    optionally ``column`` / ``new-column``; the operation selects one of
    the branches below. The (possibly mutated) frame is returned.

    SECURITY NOTE(review): ``eval`` is applied to ``step['expression']``
    in several branches — this is only safe when step definitions come
    from a trusted configuration, never from untrusted user input;
    confirm with callers.
    """
    # aliases
    op = step['operation']
    k = step['column'] if 'column' in step else None
    k_new = k if 'new-column' not in step else step['new-column']
    c_expr = step['expression']
    if op == 'text-transform':
        # Build a one-argument lambda from the configured expression and
        # map it over the source column.
        f_expr = eval('lambda value: %s' % c_expr)
        data[k_new] = data[k].apply(f_expr)
    elif op == 'categorize':
        # `categorize` is presumably a module-level helper — the
        # expression must evaluate to its `categories` argument.
        params = dict(data=data, col_name=k, categories=eval(c_expr))
        params.update({'new_col_name': k_new} if 'new-column' in step else {})
        categorize(**params)
    elif op == 'fill-na':
        fill = c_expr
        # Aggregate keywords are computed from the column itself via
        # DataFrame.eval, e.g. "price.mean()".
        if c_expr in ['mean', 'max', 'min', 'median']:
            fill = data.eval('%s.%s()' % (k, c_expr))
        data[k].fillna(fill, inplace=True)
    elif op == 'drop-na':
        # Expression holds keyword arguments for the `dropna` helper.
        params = eval(c_expr)
        dropna(data, **params)
    elif op == 'drop-unique':
        # Expression holds keyword arguments for the drop-unique helper.
        params = eval(c_expr)
        drop_columns_with_unique_values(data, **params)
    return data
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive per-team scores and the point differential via DataFrame.eval."""
    formulas = (
        "team_1_score = team_1_powers * 15 + team_1_tens * 10 - team_1_negs * 5 + team_1_bonus_points",
        "team_2_score = team_2_powers * 15 + team_2_tens * 10 - team_2_negs * 5 + team_2_bonus_points",
        "point_diff = team_1_score - team_2_score",
    )
    enriched = df
    # Each eval returns a new frame with the additional column, so later
    # formulas can reference the columns created by earlier ones.
    for formula in formulas:
        enriched = enriched.eval(formula)
    return enriched
def test_eval_resolvers_as_list(self): # GH 14095 df = DataFrame(np.random.randn(10, 2), columns=list("ab")) dict1 = {"a": 1} dict2 = {"b": 2} assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
def make_binary_array(
    data: pd.DataFrame,
    aggregation_level: Optional[AggregationLevel] = None,
    country=None,
    fips=None,
    state=None,
    states=None,
    on=None,
    after=None,
    before=None,
):
    """Create a binary array selecting rows in `data` matching the given parameters.

    When no filter parameter is supplied a select-all ``slice`` is returned;
    previously ``DataFrame.eval`` was called with an empty expression, which
    raises ``ValueError``.
    """
    query_parts = []
    # aggregation_level is almost always set. The exception is `DatasetFilter` which is used to
    # get all data in the USA, at all aggregation levels.
    if aggregation_level:
        query_parts.append(f'aggregate_level == "{aggregation_level.value}"')
    # The `@name` markers below are resolved by DataFrame.eval against this
    # function's locals, so they must match the parameter names exactly.
    if country:
        query_parts.append("country == @country")
    if state:
        query_parts.append("state == @state")
    if fips:
        query_parts.append("fips == @fips")
    if states:
        query_parts.append("state in @states")
    if on:
        query_parts.append("date == @on")
    if after:
        query_parts.append("date > @after")
    if before:
        query_parts.append("date < @before")
    if not query_parts:
        # FIX: no filters requested — select every row instead of
        # evaluating an empty expression (ValueError).
        return slice(None, None, None)
    return data.eval(" and ".join(query_parts))
def test_eval_resolvers_as_list(self): # GH 14095 df = DataFrame(randn(10, 2), columns=list('ab')) dict1 = {'a': 1} dict2 = {'b': 2} assert (df.eval('a + b', resolvers=[dict1, dict2]) == dict1['a'] + dict2['b']) assert (pd.eval('a + b', resolvers=[dict1, dict2]) == dict1['a'] + dict2['b'])
def test_eval_resolvers_as_list(self): # GH 14095 df = DataFrame(np.random.randn(10, 2), columns=list('ab')) dict1 = {'a': 1} dict2 = {'b': 2} assert (df.eval('a + b', resolvers=[dict1, dict2]) == dict1['a'] + dict2['b']) assert (pd.eval('a + b', resolvers=[dict1, dict2]) == dict1['a'] + dict2['b'])
def test_eval_resolvers_combined(self): # GH 34966 df = DataFrame(np.random.randn(10, 2), columns=list("ab")) dict1 = {"c": 2} # Both input and default index/column resolvers should be usable result = df.eval("a + b * c", resolvers=[dict1]) expected = df["a"] + df["b"] * dict1["c"] tm.assert_series_equal(result, expected)
def get_transition_probability(df_reference: pd.DataFrame,
                               df_future: pd.DataFrame, source_group: str,
                               target_group: str) -> pd.DataFrame:
    """
    Obtains transition probabilities from a given reference patient group to
    other patient groups at any visit.

    Args:
        df_reference: The reference patient groups.
        df_future: The future patient groups.
        source_group: The reference patient group.
        target_group: The target patient group.

    Returns:
        A one-row DataFrame with the transition probability and count, or
        ``None`` when no future rows exist for the reference subjects.
    """
    df_reference = df_reference.query('classification == @source_group').drop(
        'visit_id', axis=1).set_index('subject_id')
    df_future = df_future.set_index('subject_id')
    # FIX: `index1 & index2` is no longer a set intersection on modern
    # pandas (deprecated in 1.0, elementwise since 2.0); use
    # Index.intersection() explicitly.
    df_future = df_future.loc[df_reference.index.intersection(
        df_future.index)]
    if df_future.shape[0] < 1:
        return None
    df_future.eval('is_target = (classification == @target_group)',
                   inplace=True)
    # A subject counts as transitioned if ANY future visit hits the target.
    future_merged = df_future.groupby('subject_id')['is_target'].max()
    return pd.DataFrame(
        {
            'source': source_group,
            'target': target_group,
            'probability': future_merged.mean(),
            'count': future_merged.sum()
        },
        index=[0])
def filter_records(df: pd.DataFrame, criteria: str) -> pd.DataFrame:
    """Filter ``df`` down to the rows satisfying the ``criteria`` expression.

    If evaluating the criteria fails for any reason, the error is logged
    and the original data set is returned untouched.
    """
    try:
        matches = df.eval(criteria)
        filtered = df[matches]
        return filtered
    except Exception as e:
        logger.log_error(e)
        return df
def make_rows_key(
    data: pd.DataFrame,
    aggregation_level: Optional[AggregationLevel] = None,
    country=None,
    fips=None,
    state=None,
    states=None,
    on=None,
    after=None,
    before=None,
    location_id_matches: Optional[str] = None,
    exclude_county_999: bool = False,
    exclude_fips_prefix: Optional[str] = None,
):
    """Create a binary array or slice selecting rows in `data` matching the given parameters.

    Each supplied keyword contributes one conjunct to a single
    ``DataFrame.eval`` query; when no filter applies, a select-all slice
    is returned instead of evaluating an empty expression.
    """
    query_parts = []
    # aggregation_level is almost always set. The exception is `DatasetFilter` which is used to
    # get all data in the USA, at all aggregation levels.
    if aggregation_level:
        query_parts.append(f'aggregate_level == "{aggregation_level.value}"')
    # NOTE: the `@name` references below are resolved by DataFrame.eval
    # against this function's locals — local names and query strings must
    # stay in sync if anything is renamed.
    if country:
        query_parts.append("country == @country")
    if state:
        query_parts.append("state == @state")
    if fips:
        query_parts.append("fips == @fips")
    if states:
        query_parts.append("state in @states")
    if on:
        query_parts.append("date == @on")
    if after:
        query_parts.append("date > @after")
    if before:
        query_parts.append("date < @before")
    if exclude_county_999:
        # I don't think it is possible to use the default fast eval to match a substring. Instead
        # create a binary Series here and refer to it from the query.
        not_county_999 = data[CommonFields.FIPS].str[-3:] != "999"
        query_parts.append("@not_county_999")
    if location_id_matches:
        # Regex match on the LOCATION_ID index level, precomputed for the
        # same reason as above.
        location_id_match_mask = data.index.get_level_values(
            CommonFields.LOCATION_ID).str.match(location_id_matches)
        query_parts.append("@location_id_match_mask")
    if exclude_fips_prefix:
        # Drop rows whose two-character FIPS prefix (state code) matches.
        not_fips_prefix = data[
            CommonFields.FIPS].str[0:2] != exclude_fips_prefix
        query_parts.append("@not_fips_prefix")
    if query_parts:
        return data.eval(" and ".join(query_parts))
    else:
        # Select all rows
        return slice(None, None, None)
def order_cols(df: pd.DataFrame, cols: Mapping[str, str] = None) -> pd.DataFrame:
    """
    At first adds special column 'i': row index, if it is used in cols.values
    :param df:
    :param cols: mapping out col names to expressions for pd.DataFrame.eval() (using input col names) or just input col names
    :return: frame with columns named by cols' keys, in cols' key order;
        the first cols entry becomes the index unless its expression
        mentions 'index'.
    """
    df = df.copy()

    # Add row index to can eval expressions using it
    def i_term_is_used() -> bool:
        # NOTE(review): this matches any whitespace-separated term that
        # CONTAINS the letter 'i' (e.g. 'price'), not only a bare 'i' term
        # — confirm this looseness is intended.
        for in_col in cols.values():
            for term in in_col.split():
                if 'i' in term:
                    return True
        return False

    if i_term_is_used():
        df['i'] = np.arange(
            df.shape[0])  # pd.RangeIndex( , name='rec_num') same effect
    df_out = pd.DataFrame(index=df.index)
    #cols_use = omegaconf.OmegaConf.to_container(cols)  # make editable copy
    # if cols_use.pop('rec_num', None):  # 'rec_num' in df_out
    #     df_out['rec_num'] = df['rec_num']
    # Plain identifiers are renamed in bulk later; anything else is an
    # eval() expression computed immediately.
    dict_rename = {}
    for out_col, in_col in cols.items():
        if in_col.isidentifier() and not in_col in dict_rename:
            # Missing input columns are materialized as all-None so the
            # rename below cannot fail.
            if in_col not in df.columns:
                df[in_col] = None
            dict_rename[in_col] = out_col
        else:
            df_out[out_col] = df.eval(in_col)
    df_to_rename = df[dict_rename.keys()]
    # removing index if exists because df.rename() renames only columns
    col_index = dict_rename.pop('index', None)  # index will be placed in this column
    if col_index:
        df_out[col_index] = df_out.index
    df_out = df_out.join(df_to_rename.rename(columns=dict_rename, copy=False))
    cols_iter = iter(cols.items())
    index_name, in_1st_col = next(cols_iter)
    if 'index' not in in_1st_col:  # original index is not at 1st column so need to be replaced
        df_out.set_index(index_name, inplace=True)
        # Remaining cols (after the consumed first entry) give the column order.
        return df_out[[k for k, v in cols_iter]]
    #df_out['DATE'] = df_out['DATE'].dt.tz_convert(None)
    return df_out[cols.keys()]
def _construct_bands(self, quotes: pd.DataFrame) -> pd.DataFrame: # Standard Bolling Bands Algorithm quotes['TP'] = quotes.eval("(high + low + close) / 3") quotes['std_dev'] = quotes['TP'].rolling(self.num_periods).std() quotes['band_center'] = self._get_band_center(quotes) quotes['band_upper'] = quotes['band_center'] + self.deviations * quotes['std_dev'] quotes['band_lower'] = quotes['band_center'] - self.deviations * quotes['std_dev'] # Long term rolling standard deviation, used to evaluate "consolidation periods". quotes['long_term_std'] = quotes['TP'].rolling(self.long_periods).std() return quotes
def __init__(
    self,
    quotes: pd.DataFrame,
    num_periods: int,
    deviations: float,
    long_periods: int = 60,
    pace: Optional[float] = None
) -> None:
    """Initialize the strategy, deriving a default ``pace`` when omitted.

    The default pace is the typical-price variance normalized by the
    signal's full-length ('valid') autocorrelation.
    """
    if pace is None:
        typical_price = quotes.eval("(high + low + close) / 3").values
        pace = typical_price.var() / np.correlate(
            typical_price, typical_price, 'valid')
    self.pace = pace
    super().__init__(quotes, num_periods, deviations, long_periods)
def __init__(self,
             quotes: pd.DataFrame,
             short_periods: int,
             long_periods: int,
             signal_periods: int,
             tolerance: float = 2e-1,
             pace: Optional[float] = None) -> None:
    """Initialize the strategy; estimate ``pace`` from the data when omitted."""
    if pace is None:
        # Typical price per bar, variance-normalized by the 'valid'
        # (full-overlap) autocorrelation.
        series = quotes.eval("(high + low + close) / 3").values
        pace = series.var() / np.correlate(series, series, 'valid')
    self.pace = pace
    super().__init__(quotes, short_periods, long_periods, signal_periods,
                     tolerance)
class TestDataFrameEvalWithFrame:
    """DataFrame.eval smoke tests across all parser/engine combinations."""

    def setup_method(self):
        self.frame = DataFrame(np.random.randn(10, 3), columns=list("abc"))

    def teardown_method(self):
        del self.frame

    def test_simple_expr(self, parser, engine):
        expected = self.frame.a + self.frame.b
        result = self.frame.eval("a + b", engine=engine, parser=parser)
        tm.assert_series_equal(result, expected)

    def test_bool_arith_expr(self, parser, engine):
        expected = self.frame.a[self.frame.a < 1] + self.frame.b
        result = self.frame.eval("a[a < 1] + b", engine=engine, parser=parser)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("op", ["+", "-", "*", "/"])
    def test_invalid_type_for_operator_raises(self, parser, engine, op):
        # Arithmetic between an int column and a str column must raise.
        frame = DataFrame({"a": [1, 2], "b": ["c", "d"]})
        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
        with pytest.raises(TypeError, match=msg):
            frame.eval(f"a {op} b", engine=engine, parser=parser)
def plot_error_residuals(predictions: pd.DataFrame) -> None:
    """Render a residuals-vs-predicted scatter with a zero baseline in Streamlit."""
    residual_frame = predictions.eval("Residuals = predicted - real")
    points = (
        alt.Chart(residual_frame)
        .mark_circle(size=100)
        .encode(
            alt.X("predicted", title="Predicted", scale=alt.Scale(zero=False)),
            alt.Y("Residuals", title="Residuals"),
            alt.Color("target"),
        )
    )
    # Horizontal rule at zero marks a perfect prediction.
    baseline = alt.Chart(pd.DataFrame([{
        "zero": 0
    }])).mark_rule().encode(alt.Y("zero"))
    st.altair_chart(points + baseline, use_container_width=True)
def enrich_etf_summary(clean_etf_summary: pd.DataFrame,
                       buy_params: Dict) -> pd.DataFrame:
    """Add buy/sell decision columns derived from the forecast bands.

    Computes band-overshoot percentages, buy/sell flags, the dividend
    yield as a decimal, and the expected return scaled by the configured
    volatility factor.
    """
    vf = buy_params['volatility_factor']
    summary = clean_etf_summary.eval(
        'pct_over_yhat_upper = (day_high - yhat_upper) / yhat_upper')
    summary = summary.eval(
        'pct_below_yhat_lower = (yhat_lower - day_low) / yhat_lower')
    summary = summary.eval('sell_flag = day_high > yhat_upper')
    summary = summary.eval('buy_flag = day_low < yhat_lower')
    # Dividend yield arrives as a percentage string such as '1.5%'.
    summary = summary.assign(
        dividend_decimal=lambda frame: frame.dividend_yield.str.strip('%').
        astype(float).fillna(0) / 100)
    summary = summary.eval(
        f'expected_return = 100 * {vf} * volatility + 100 * dividend_decimal')
    return summary
def interaction_sums(labeled_interactions: pd.DataFrame) -> pd.DataFrame:
    '''Sum interaction strengths by type across the region.

    Builds a region-indexed measure matrix: one column per
    (from-type, to-type) pair for within-region interactions, plus
    out/all aggregates per type for cross-region flows. Relies on the
    module-level `TYPE_PREFIX` flag columns and the private helpers
    `_get_available_types`, `_intersum_measure_name` and
    `_sum_interactions` (presumably grouping by region_from unless
    `key='region_to'` is given — confirm with their definitions).
    '''
    within_region = labeled_interactions.eval('region_from == region_to')
    across_regions = ~within_region
    # Synthetic 'any' type flags match every row on both endpoints.
    labeled_interactions = labeled_interactions.assign(
        **{
            TYPE_PREFIX + 'any_from': True,
            TYPE_PREFIX + 'any_to': True,
        })
    # Strip the '_from'/'_to' suffix to recover the bare type names.
    unit_types = [
        utype[:-5] if utype.endswith('_from') else utype[:-3]
        for utype in _get_available_types(labeled_interactions)
    ]
    # All regions appearing on either endpoint, sorted for a stable index.
    regions = np.sort(
        np.array(
            list(
                set(labeled_interactions['region_from'].unique().tolist() +
                    labeled_interactions['region_to'].unique().tolist()))))
    measures = pd.DataFrame([], index=regions)
    # Within-region sums for every (from-type, to-type) combination.
    for from_unit_type in unit_types:
        for to_unit_type in unit_types:
            measure_name = _intersum_measure_name(from_unit_type,
                                                  to_unit_type)
            measures[measure_name] = _sum_interactions(labeled_interactions[
                labeled_interactions[f'{TYPE_PREFIX}{from_unit_type}_from']
                & labeled_interactions[f'{TYPE_PREFIX}{to_unit_type}_to']
                & within_region]).reindex(regions, fill_value=0)
    # Outbound/inbound aggregates: 'out' restricts to cross-region rows,
    # 'all' keeps every row; fill_value=0 covers regions with no rows.
    for unit_type in unit_types:
        measures[_intersum_measure_name(unit_type, 'out')] = _sum_interactions(
            labeled_interactions[
                labeled_interactions[f'{TYPE_PREFIX}{unit_type}_from']
                & across_regions]).reindex(regions, fill_value=0)
        measures[_intersum_measure_name(unit_type, 'all')] = _sum_interactions(
            labeled_interactions[labeled_interactions[
                f'{TYPE_PREFIX}{unit_type}_from']]).reindex(regions,
                                                            fill_value=0)
        measures[_intersum_measure_name('out', unit_type)] = _sum_interactions(
            labeled_interactions[
                labeled_interactions[f'{TYPE_PREFIX}{unit_type}_to']
                & across_regions], key='region_to').reindex(regions,
                                                            fill_value=0)
        measures[_intersum_measure_name('all', unit_type)] = _sum_interactions(
            labeled_interactions[
                labeled_interactions[f'{TYPE_PREFIX}{unit_type}_to']],
            key='region_to').reindex(regions, fill_value=0)
    return measures
def apply_filter_alert_by_epiweek(df: pd.DataFrame, view_name: str,
                                  epiweek: int = None):
    """Return alert rows, optionally restricted to a single epiweek.

    :param df: source frame with an ``epiweek`` column
    :param view_name: accepted for interface compatibility; not used here
    :param epiweek: when given, keep only rows for this epiweek
    :return: filtered copy of ``df`` with the index reset
    """
    if epiweek is None:
        # Indexing by the full column list keeps every row.
        selector = df.keys()
    else:
        selector = df.eval('epiweek=={}'.format(epiweek))
    return df[selector].copy().reset_index()
def get_transition_probability(
    X_reference: pd.DataFrame,
    X_future: pd.DataFrame,
    source_group: str,
    target_group: str,
) -> pd.DataFrame:
    """
    Obtains transition probabilities from a given reference patient group to
    other patient groups at any visit.

    Args:
        X_reference: the reference patient groups
        X_future: the future patient groups
        source_group: the reference patient group
        target_group: the target patient group
    """
    reference = (X_reference.query("classification == @source_group").drop(
        "visit_id", axis=1).set_index("subject_id"))
    # Inner-join against the (column-less) reference frame restricts the
    # future rows to subjects present in the reference group.
    future = X_future.set_index("subject_id").join(reference[[]],
                                                   how="inner")
    if future.shape[0] < 1:
        return None
    future = future.eval("is_target = (classification == @target_group)")
    # A subject counts as transitioned if ANY future visit hits the target.
    per_subject = future.groupby("subject_id")["is_target"].max()
    return pd.DataFrame(
        {
            "source": source_group,
            "target": target_group,
            "probability": per_subject.mean(),
            "count": per_subject.sum(),
        },
        index=[0],
    )
def test_invalid_type_for_operator_raises(self, parser, engine, op):
    """eval of ``int_col <op> str_col`` must raise a TypeError."""
    frame = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
    expected_msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
    expression = 'a {0} b'.format(op)
    with pytest.raises(TypeError, match=expected_msg):
        frame.eval(expression, engine=engine, parser=parser)
def test_eval_object_dtype_binop(self): # GH#24883 df = DataFrame({"a1": ["Y", "N"]}) res = df.eval("c = ((a1 == 'Y') & True)") expected = DataFrame({"a1": ["Y", "N"], "c": [True, False]}) tm.assert_frame_equal(res, expected)
def test_invalid_type_for_operator_raises(self, parser, engine, op):
    """Applying ``op`` between an int and a str column raises TypeError."""
    frame = DataFrame({"a": [1, 2], "b": ["c", "d"]})
    pattern = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
    with pytest.raises(TypeError, match=pattern):
        frame.eval(f"a {op} b", engine=engine, parser=parser)
def integrities(interaction_sums: pd.DataFrame) -> pd.DataFrame:
    """Evaluate each integrity expression against the interaction sums.

    Returns a frame with one column per entry of the module-level
    ``INTEGRITY_EXPRS`` mapping (measure name -> eval expression).
    """
    columns = {}
    for measure_name, expression in INTEGRITY_EXPRS.items():
        columns[measure_name] = interaction_sums.eval(expression)
    return pd.DataFrame(columns)