def test_reset_index(self):
    df = tm.makeDataFrame()[:5]
    ser = df.stack()
    ser.index.names = ['hash', 'category']

    ser.name = 'value'
    df = ser.reset_index()
    self.assertIn('value', df)

    df = ser.reset_index(name='value2')
    self.assertIn('value2', df)

    # check inplace
    s = ser.reset_index(drop=True)
    s2 = ser
    s2.reset_index(drop=True, inplace=True)
    assert_series_equal(s, s2)

    # level
    index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                       labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                               [0, 1, 0, 1, 0, 1]])
    s = Series(np.random.randn(6), index=index)
    rs = s.reset_index(level=1)
    self.assertEqual(len(rs.columns), 2)

    rs = s.reset_index(level=[0, 2], drop=True)
    self.assertTrue(rs.index.equals(Index(index.get_level_values(1))))
    tm.assertIsInstance(rs, Series)
def test_reset_index(self):
    df = tm.makeDataFrame()[:5]
    ser = df.stack()
    ser.index.names = ['hash', 'category']

    ser.name = 'value'
    df = ser.reset_index()
    assert 'value' in df

    df = ser.reset_index(name='value2')
    assert 'value2' in df

    # check inplace
    s = ser.reset_index(drop=True)
    s2 = ser
    s2.reset_index(drop=True, inplace=True)
    tm.assert_series_equal(s, s2)

    # level
    index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                       codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                              [0, 1, 0, 1, 0, 1]])
    s = Series(np.random.randn(6), index=index)
    rs = s.reset_index(level=1)
    assert len(rs.columns) == 2

    rs = s.reset_index(level=[0, 2], drop=True)
    tm.assert_index_equal(rs.index, Index(index.get_level_values(1)))
    assert isinstance(rs, Series)
def test_set_index_cast_datetimeindex(self):
    df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                          for i in range(1000)],
                    'B': np.random.randn(1000)})

    idf = df.set_index('A')
    assert isinstance(idf.index, pd.DatetimeIndex)

    # don't cast a DatetimeIndex WITH a tz, leave as object
    # GH 6032
    i = (pd.DatetimeIndex(
        to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'], errors="raise"))
        .tz_localize('US/Pacific'))
    df = DataFrame(np.random.randn(2, 1), columns=['A'])
    expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                             tz='US/Pacific'),
                                pd.Timestamp('2013-01-02 14:00:00-0800',
                                             tz='US/Pacific')],
                               dtype="object"))

    # convert index to series
    result = Series(i)
    assert_series_equal(result, expected)

    # assign to frame
    df['B'] = i
    result = df['B']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'B'

    # keep the timezone
    result = i.to_series(keep_tz=True)
    assert_series_equal(result.reset_index(drop=True), expected)

    # convert to utc
    df['C'] = i.to_series().reset_index(drop=True)
    result = df['C']
    comp = pd.DatetimeIndex(expected.values)
    comp = comp.tz_localize(None)
    tm.assert_numpy_array_equal(result.values, comp.values)

    # list of datetimes with a tz
    df['D'] = i.to_pydatetime()
    result = df['D']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'D'

    # GH 6785
    # set the index manually
    import pytz
    df = DataFrame(
        [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
    expected = df.set_index('ts')
    df.index = df['ts']
    df.pop('ts')
    assert_frame_equal(df, expected)
def test_reset_index_range(self):
    # GH 12071
    s = Series(range(2), name='A', dtype='int64')
    series_result = s.reset_index()
    assert isinstance(series_result.index, RangeIndex)
    series_expected = DataFrame([[0, 0], [1, 1]],
                                columns=['index', 'A'],
                                index=RangeIndex(stop=2))
    tm.assert_frame_equal(series_result, series_expected)
def test_droplevel(self):
    # GH20342
    ser = Series([1, 2, 3, 4])
    ser.index = MultiIndex.from_arrays([(1, 2, 3, 4), (5, 6, 7, 8)],
                                       names=['a', 'b'])
    expected = ser.reset_index('b', drop=True)
    result = ser.droplevel('b', axis='index')
    tm.assert_series_equal(result, expected)
    # test that droplevel raises ValueError on axis != 0
    with pytest.raises(ValueError):
        ser.droplevel(1, axis='columns')
def test_reset_index_right_dtype(self):
    time = np.arange(0.0, 10, np.sqrt(2) / 2)
    s1 = Series((9.81 * time ** 2) / 2,
                index=Index(time, name='time'), name='speed')
    df = DataFrame(s1)

    resetted = s1.reset_index()
    self.assertEqual(resetted['time'].dtype, np.float64)

    resetted = df.reset_index()
    self.assertEqual(resetted['time'].dtype, np.float64)
def test_select_words_of_length(words: List[str], min_len: Optional[int],
                                max_len: Optional[int],
                                exp: pd.Series) -> None:
    """Test the _select_words_of_length with different cases."""
    word_pool = WordPool(words)
    word_pool.select_words_of_length(min_len, max_len)
    obs: pd.Series = word_pool._pool_cleaned
    obs = obs.reset_index(drop=True)
    exp = exp.reset_index(drop=True)
    # equals() alone has no effect in a test; assert its result
    assert obs.equals(exp)
def test_reset_index_right_dtype(self):
    time = np.arange(0.0, 10, np.sqrt(2) / 2)
    s1 = Series((9.81 * time ** 2) / 2,
                index=Index(time, name='time'), name='speed')
    df = DataFrame(s1)

    resetted = s1.reset_index()
    assert resetted['time'].dtype == np.float64

    resetted = df.reset_index()
    assert resetted['time'].dtype == np.float64
def __init__(
    self,
    data: pd.Series,
    kind: Literal['support', 'resistance'] = 'support'
):
    if not isinstance(data, pd.Series):
        raise TypeError('data should be pd.Series')
    self.y = data.reset_index(drop=True).rename('y').rename_axis('x')
    self.x = self.y.index.to_series()
    self.length = len(self.y)
    self.kind = kind
    self.dot_color = 'g' if kind == 'support' else 'r'
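# A minimal illustration (not part of the original source) of the index
# normalization in the constructor above: reset_index(drop=True) discards
# the original index, rename('y') names the values, and rename_axis('x')
# names the fresh positional index, giving clean x/y plot coordinates.
import pandas as pd

prices = pd.Series([10.0, 9.5, 9.8],
                   index=pd.date_range('2021-01-04', periods=3))
y = prices.reset_index(drop=True).rename('y').rename_axis('x')
x = y.index.to_series()
print(y.name, y.index.name, list(x))  # -> y x [0, 1, 2]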
def get_flat_demand_cost(ser: pd.Series):
    idx, reported_val, max_val = get_interval_max_demand(
        ser.values, n_intervals=demand_window_intervals)
    ts = ser.reset_index()['index'].iloc[idx]
    # Note: both branches currently compute the same flat cost; flat
    # demand rates do not distinguish holidays/weekends from weekdays.
    # if we're on a holiday or weekend
    if ts in self.holidays or ts.dayofweek not in self.weekmask:
        return calculate_flat_cost(reported_val, ts.month,
                                   self.flat_demand_months,
                                   self.flat_demand_rates)
    # Otherwise, it's a weekday
    return calculate_flat_cost(reported_val, ts.month,
                               self.flat_demand_months,
                               self.flat_demand_rates)
def find_anomalies(volume_data: pd.Series, std_dev_cutoff: int = 10,
                   days_cutoff: int = 3):
    """Flag volume spikes more than `std_dev_cutoff` standard deviations
    above the mean within the last `days_cutoff` days.

    Args:
        volume_data: Series of volumes indexed by date.
        std_dev_cutoff: Number of standard deviations above the mean
            beyond which a value counts as anomalous.
        days_cutoff: Only anomalies within this many days of the most
            recent observation are returned.

    Returns:
        Tuple of (anomaly DataFrame, mean, standard deviation).
    """
    # Series.reset_index(inplace=True) cannot turn a Series into a
    # DataFrame in place; reassign the result instead.
    volume_data = volume_data.reset_index()
    volume_data.columns = ['date', 'volume']
    cutoff_date = volume_data['date'].max() - pd.Timedelta(days=days_cutoff)
    data_std_dev = volume_data['volume'].std()
    data_mean = volume_data['volume'].mean()
    anomaly_cut_off = data_mean + data_std_dev * std_dev_cutoff
    anomaly_data = volume_data[volume_data['volume'] > anomaly_cut_off].copy()
    anomaly_data['std_devs'] = (anomaly_data['volume'] - data_mean) / data_std_dev
    anomaly_data = anomaly_data[anomaly_data['date'] >= cutoff_date].copy()
    anomaly_data['date'] = anomaly_data['date'].astype(str).str[:10]
    return anomaly_data, data_mean, data_std_dev
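# A minimal usage sketch (not part of the original source), assuming daily
# volume data with one large spike near the end. Note the spike itself
# inflates the standard deviation, so a modest cutoff is used here.
import numpy as np
import pandas as pd

dates = pd.date_range('2021-01-01', periods=30, freq='D')
volumes = pd.Series(np.full(30, 100.0), index=dates)
volumes.iloc[-2] = 10_000.0  # inject an anomaly two days before the end

anomalies, mean, std = find_anomalies(volumes, std_dev_cutoff=5, days_cutoff=3)
print(anomalies)  # one row: the injected spike with its std_devs score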
def write_serie(self, serie: pd.Series, periodicity: str, fields: dict,
                writer: csv.writer):
    field_id = fields[serie.name]

    # Trim leading/trailing NaN values
    serie = serie[serie.first_valid_index():serie.last_valid_index()]

    df = serie.reset_index().apply(self.rows, axis=1,
                                   args=(self.fields_data, field_id,
                                         periodicity))
    serie = pd.Series(df.values, index=serie.index)
    for row in serie:
        writer.writerow(row)
def pure_profit_score(close: Series) -> float:
    """Pure Profit Score of a series.

    Args:
        close (pd.Series): Series of 'close's

    >>> result = ta.pure_profit_score(df.close)
    """
    close = verify_series(close)
    close_index = Series(0, index=close.reset_index().index)
    r = linear_regression(close_index, close)["r"]
    if r is not npNaN:
        return r * cagr(close)
    return 0
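# A short illustration (not part of the original source) of the
# close.reset_index().index trick used above: whatever the input is
# indexed by, reset_index() yields a frame with a fresh RangeIndex, so
# this produces a 0..n-1 positional axis for the regression.
import pandas as pd

close = pd.Series([1.0, 2.0, 3.0],
                  index=pd.to_datetime(['2021-01-04', '2021-01-05',
                                        '2021-01-06']))
positional = close.reset_index().index
print(positional)  # RangeIndex(start=0, stop=3, step=1)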
def __init__(self, data_id: pd.Series, task_id: int, target_name: pd.Series,
             task_target: pd.Series, shared_target: pd.Series,
             task_claim: pd.Series, shared_claim: pd.Series,
             task_attn_mask: pd.Series, shared_attn_mask: pd.Series,
             labels: pd.Series):
    self.data_id = list(data_id)
    self.task_id = task_id
    self.target_name = target_name.reset_index(drop=True)
    self.task_target = list(task_target)
    self.shared_target = list(shared_target)
    self.task_claim = list(task_claim)
    self.shared_claim = list(shared_claim)
    self.task_attn_mask = list(task_attn_mask)
    self.shared_attn_mask = list(shared_attn_mask)
    self.label = list(labels)
def create_infections_from_deaths(
    daily_deaths: pd.Series,
    pred_ifr: pd.Series,
    durations: Dict,
) -> pd.Series:
    # 7-day centered rolling mean of daily deaths within each location
    daily_deaths = (daily_deaths
                    .reset_index()
                    .groupby('location_id')
                    .apply(lambda x: pd.Series(
                        x['daily_deaths'].rolling(window=7, min_periods=7,
                                                  center=True).mean().values,
                        index=x['date']))
                    .dropna())
    infections = ((daily_deaths / pred_ifr)
                  .rename('infections')
                  .dropna()
                  .sort_index()
                  .reset_index())
    # shift from death date back to exposure date
    infections['date'] -= pd.Timedelta(days=durations['exposure_to_death'])
    infections = infections.set_index(['location_id', 'date'])
    return infections
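# A toy illustration (not part of the original source) of the per-location
# 7-day centered smoothing used above: with 9 days per location and
# min_periods=7, three smoothed values survive per location, and the
# result comes back with a (location_id, date) MultiIndex.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [[10, 20], pd.date_range('2020-03-01', periods=9)],
    names=['location_id', 'date'])
daily_deaths = pd.Series(np.arange(18, dtype=float), index=idx,
                         name='daily_deaths')
smoothed = (daily_deaths.reset_index()
            .groupby('location_id')
            .apply(lambda x: pd.Series(
                x['daily_deaths'].rolling(window=7, min_periods=7,
                                          center=True).mean().values,
                index=x['date']))
            .dropna())
print(smoothed)  # 3 smoothed values per location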
def pie_chart(data: pd.Series, colors: List[str] = None, title: str = None,
              plot_height: int = 250, plot_width=None, radius: float = 0.1,
              toolbar_location: str = 'right', x_range=None,
              show_legend=True) -> Figure:
    data = data.reset_index(name='value').rename(columns={'index': 'column'})
    data['angle'] = data['value'] / data['value'].sum() * 2 * pi
    data['perc'] = data['value'] / data['value'].sum() * 100
    if colors:
        data['color'] = colors
    else:
        # Category20c palettes start at 3 colors; slice for 1-2 wedges
        data['color'] = (Category20c[len(data)] if len(data) > 2
                         else Category20c[3][:len(data)])

    p = figure(
        plot_height=plot_height,
        title=title,
        toolbar_location=toolbar_location,
        tools="pan,save,hover",
        tooltips="@column: @value (@perc%)",
        x_range=x_range,
    )
    p.wedge(x=0, y=1, radius=radius,
            start_angle=cumsum('angle', include_zero=True),
            end_angle=cumsum('angle'),
            line_color="white", fill_color='color',
            legend_field='column', source=data)
    p.axis.axis_label = None
    p.axis.visible = False
    p.grid.grid_line_color = None
    p.legend.visible = show_legend
    if plot_width:
        p.width = plot_width
    return p
def pd_column_analysis(ds: pd.Series, value_col: str = "values",
                       sort: str = None) -> pd.DataFrame:
    """Returns formatted dataframe from standard column analysis"""
    df = ds.reset_index().rename(columns={"index": "columns", 0: value_col})
    if sort is None:
        return df
    if sort.lower().startswith("asc"):
        df = df.sort_values(by=value_col, ascending=True)
    if sort.lower().startswith("desc"):
        df = df.sort_values(by=value_col, ascending=False)
    return df.reset_index(drop=True)
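# A minimal usage sketch (not part of the original source): a per-column
# summary such as df.isna().sum() is an unnamed Series whose index holds
# the column names, which reset_index() turns into an 'index'/0 frame
# before the renames above apply.
import pandas as pd

df = pd.DataFrame({'a': [1, None, 3], 'b': [None, None, 6]})
report = pd_column_analysis(df.isna().sum(), value_col='n_missing',
                            sort='desc')
print(report)  # columns: 'columns', 'n_missing', sorted descending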
def __resampleData(self, data: pd.Series, resampleFreq: str,
                   aggStrategy: str) -> pd.Series:
    if len(data) == 0:
        return data
    if pd.isna(resampleFreq):
        return data
    if not (resampleFreq.lower() in ['s', 'm', 'b', 'h', 'd']):
        return data
    if pd.isna(aggStrategy) or (aggStrategy.lower() == 'raw'):
        return data

    # storing series labels
    seriesName = data.name
    indName = data.index.name

    # changing series labels
    data.name = 'vals'
    data.index.name = 'times'
    data = data.reset_index()

    # modify times as per resampleFreq
    # https://stackoverflow.com/questions/43400331/remove-seconds-and-minutes-from-a-pandas-dataframe-column
    if resampleFreq.lower() == 'd':
        data = data.assign(times=data.times.dt.floor('D'))
    elif resampleFreq.lower() == 'h':
        data = data.assign(times=data.times.dt.floor('H'))
    elif resampleFreq.lower() == 'm':
        data = data.assign(times=data.times.dt.floor('min'))
    elif resampleFreq.lower() == 's':
        data = data.assign(times=data.times.dt.floor('S'))
    elif resampleFreq.lower() == 'b':
        # floor to the minute, then snap down to 15-minute buckets
        data = data.assign(times=data.times.dt.floor('min'))
        data.times = data.times.map(
            lambda x: x.replace(minute=(x.minute - x.minute % 15)))

    # aggregate the samples based on times
    if aggStrategy.lower() == 'snap':
        data = data.groupby('times', as_index=False).first()
    elif aggStrategy.lower() == 'average':
        data = data.groupby('times', as_index=False).mean()
    data = pd.Series(data.vals.values, index=data.times.values)

    # restore original labels
    data.name = seriesName
    data.index.name = indName
    return data
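# A short illustration (not part of the original source) of the
# 15-minute snapping used for the 'b' frequency above.
import pandas as pd

t = pd.Timestamp('2021-01-04 10:47:59')
snapped = t.floor('min').replace(minute=t.minute - t.minute % 15)
print(snapped)  # 2021-01-04 10:45:00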
def _clean_timeseries(observed_ts: pd.Series) -> pd.Series:
    """Clean and Normalize time_series for subsequent processing.

    The following is performed on the time_series:
        - index reset
        - duplicates dropped
        - na values dropped

    Args:
        observed_ts (Series): The time_series to normalize.

    Returns:
        (Series): The normalized time_series
    """
    observed_ts = observed_ts.reset_index(drop=True)
    observed_ts = observed_ts.drop_duplicates()
    observed_ts = observed_ts.dropna()
    return observed_ts
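# A minimal usage sketch (not part of the original source): duplicates and
# NaN values are removed, and the surviving values keep the positional
# labels assigned by the initial reset_index(drop=True).
import numpy as np
import pandas as pd

raw = pd.Series([1.0, 1.0, np.nan, 2.0], index=[10, 20, 30, 40])
print(_clean_timeseries(raw))  # index [0, 3], values [1.0, 2.0]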
def get_demand_cost(ser: pd.Series):
    idx, reported_val, max_val = get_interval_max_demand(
        ser.values, n_intervals=demand_window_intervals)
    ts = ser.reset_index()['index'].iloc[idx]
    cost = 0.0
    try:
        # if we're on a holiday or weekend
        if (ts.date() in self.holidays
                or ts.dayofweek not in self.weekmask):
            cost = calculate_tou_cost(reported_val, ts.month, ts.hour,
                                      self.demand_weekend_schedule,
                                      self.demand_rates)
        # Otherwise, it's a weekday
        else:
            cost = calculate_tou_cost(reported_val, ts.month, ts.hour,
                                      self.demand_weekday_schedule,
                                      self.demand_rates)
    except Exception:
        pass
    return cost
def __init__(self, master: pd.Series,
             duplicates: Optional[pd.Series] = None,
             master_id: Optional[pd.Series] = None,
             duplicates_id: Optional[pd.Series] = None,
             **kwargs):
    """
    StringGrouper is a class that holds the matrix with cosine similarities
    between the master and duplicates matrix. If duplicates is not given it
    is replaced by master. To build this matrix the `fit` function must be
    called. It is possible to add and remove matches after building with
    the add_match and remove_match functions.

    :param master: pandas.Series. A series of strings in which similar
        strings are searched, either against itself or against the
        `duplicates` series.
    :param duplicates: pandas.Series. If set, for each string in duplicates
        a similar string is searched in Master.
    :param master_id: pandas.Series. If set, contains ID values for each
        row in master series.
    :param duplicates_id: pandas.Series. If set, contains ID values for
        each row in duplicates series.
    :param kwargs: All other keyword arguments are passed to
        StringGrouperConfig.
    """
    # Validate match strings input
    if not StringGrouper._is_series_of_strings(master) or \
            (duplicates is not None
             and not StringGrouper._is_series_of_strings(duplicates)):
        raise TypeError(
            'Input does not consist of pandas.Series containing only Strings'
        )
    # Validate optional IDs input
    if not StringGrouper._is_input_data_combination_valid(
            duplicates, master_id, duplicates_id):
        raise Exception('List of data Series options is invalid')
    StringGrouper._validate_id_data(master, duplicates, master_id,
                                    duplicates_id)

    self._master: pd.Series = master.reset_index(drop=True)
    self._duplicates: pd.Series = (duplicates.reset_index(drop=True)
                                   if duplicates is not None else None)
    self._master_id: pd.Series = (master_id.reset_index(drop=True)
                                  if master_id is not None else None)
    self._duplicates_id: pd.Series = (duplicates_id.reset_index(drop=True)
                                      if duplicates_id is not None else None)
    self._config: StringGrouperConfig = StringGrouperConfig(**kwargs)
    self.is_build = False  # indicates if the grouper was fit or not
    self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams)
    # After the StringGrouper is built, _matches_list will contain the
    # indices and similarities of two matches
    self._matches_list: pd.DataFrame = pd.DataFrame()
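# A hedged usage sketch (not part of the original source), assuming the
# surrounding string_grouper package: build the grouper from a master
# Series, then fit() computes the cosine-similarity matches.
import pandas as pd
from string_grouper import StringGrouper

companies = pd.Series(['ACME Inc', 'ACME Incorporated', 'Globex LLC'])
grouper = StringGrouper(companies).fit()
print(grouper.get_matches())  # pairs of similar strings with similarities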
def _add_label_and_progress(
        s: pd.Series,
        pretty_models: Mapping[str, common_config.RewardCfg]) -> pd.DataFrame:
    """Add pretty label and checkpoint progress to reward distances."""
    labels = s.index.map(
        functools.partial(_pretty_label, pretty_mapping=pretty_models))
    df = s.reset_index(name="Distance")
    regex = ".*/checkpoints/(?P<Checkpoint>final|[0-9]+)(?:/.*)?$"
    match = df["source_reward_path"].str.extract(regex)
    match["Reward"] = labels
    grp = match.groupby("Reward")
    progress = grp.apply(_checkpoint_to_progress)
    progress = progress.reset_index("Reward", drop=True)
    df["Progress"] = progress
    df["Reward"] = labels
    return df
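# A quick demonstration (not part of the original source) of the
# checkpoint-extraction regex used above.
import pandas as pd

paths = pd.Series(['runs/a/checkpoints/000123/model.pkl',
                   'runs/b/checkpoints/final'])
regex = ".*/checkpoints/(?P<Checkpoint>final|[0-9]+)(?:/.*)?$"
print(paths.str.extract(regex))  # Checkpoint column: '000123', 'final'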
def test_reset_index_drop_errors(self):
    # GH 20925

    # KeyError raised for series index when passed level name is missing
    s = Series(range(4))
    with pytest.raises(KeyError, match='must be same as name'):
        s.reset_index('wrong', drop=True)
    with pytest.raises(KeyError, match='must be same as name'):
        s.reset_index('wrong')

    # KeyError raised for series when level to be dropped is missing
    s = Series(range(4), index=MultiIndex.from_product([[1, 2]] * 2))
    with pytest.raises(KeyError, match='not found'):
        s.reset_index('wrong', drop=True)
def test_reset_index_drop_errors(self):
    # GH 20925

    # KeyError raised for series index when passed level name is missing
    s = Series(range(4))
    with pytest.raises(KeyError, match="does not match index name"):
        s.reset_index("wrong", drop=True)
    with pytest.raises(KeyError, match="does not match index name"):
        s.reset_index("wrong")

    # KeyError raised for series when level to be dropped is missing
    s = Series(range(4), index=MultiIndex.from_product([[1, 2]] * 2))
    with pytest.raises(KeyError, match="not found"):
        s.reset_index("wrong", drop=True)
def test_factor_rank_autocorrelation(self, factor_values, sector_values,
                                     end_date, time_rule, by_sector,
                                     expected_vals):
    dr = date_range(start='2015-1-1', end=end_date)
    dr.name = 'date'
    tickers = ['A', 'B', 'C', 'D']
    factor_df = DataFrame(index=dr, columns=tickers,
                          data=factor_values).stack()
    factor_df.index = factor_df.index.set_names(['date', 'asset'])
    factor = Series(factor_df)
    factor.name = 'factor'
    factor = factor.reset_index()
    factor['sector'] = sector_values
    factor = factor.set_index(['date', 'asset', 'sector']).factor

    fa = factor_rank_autocorrelation(factor, time_rule, by_sector)
    expected = Series(index=fa.index, data=expected_vals)
    assert_series_equal(fa, expected)
def test_factor_rank_autocorrelation(self, factor_values, group_values,
                                     end_date, time_rule, by_group,
                                     expected_vals):
    dr = date_range(start='2015-1-1', end=end_date)
    dr.name = 'date'
    tickers = ['A', 'B', 'C', 'D']
    factor_df = DataFrame(index=dr, columns=tickers,
                          data=factor_values).stack()
    factor_df.index = factor_df.index.set_names(['date', 'asset'])
    factor = Series(factor_df)
    factor.name = 'factor'
    factor = factor.reset_index()
    factor['group'] = group_values
    factor = factor.set_index(['date', 'asset', 'group']).factor

    fa = factor_rank_autocorrelation(factor, time_rule, by_group)
    expected = Series(index=fa.index, data=expected_vals)
    assert_series_equal(fa, expected)
def get_sub_loc_deaths(
    sub_location: int,
    n_draws: int,
    sub_infections_draws: pd.DataFrame,
    ifr: pd.DataFrame,
    durations: List[Dict],
    reported_deaths: pd.Series,
) -> Tuple[pd.DataFrame, pd.Series]:
    if sub_location in reported_deaths.reset_index()['location_id'].to_list():
        reported_deaths = reported_deaths.loc[sub_location]
    else:
        reported_deaths = 1
    loc_deaths = []
    loc_scalar = []
    for draw in range(n_draws):
        _ifr = ifr.loc[sub_location, draw]['ratio']
        _deaths = sub_infections_draws.loc[sub_location,
                                           f'draw_{draw}'].reset_index()
        _deaths['date'] += pd.Timedelta(
            days=durations[draw]['exposure_to_death'])
        _deaths = _deaths.set_index('date').loc[:, f'draw_{draw}']
        _deaths = (_deaths * _ifr).dropna().rename(f'draw_{draw}')
        trim_days = (durations[draw]['exposure_to_death']
                     - durations[draw]['exposure_to_case'])
        _deaths = _deaths[:-trim_days]
        loc_scalar.append(_deaths.sum() / reported_deaths)
        loc_deaths.append(_deaths)
    loc_deaths = pd.concat(loc_deaths, axis=1).dropna()
    loc_deaths['location_id'] = sub_location
    loc_deaths = (loc_deaths
                  .reset_index()
                  .set_index(['location_id', 'date'])
                  .sort_index())
    loc_scalar = pd.DataFrame({
        'draw': list(range(n_draws)),
        'location_id': sub_location,
        'em_scalar': loc_scalar,
    })
    loc_scalar = (loc_scalar
                  .set_index(['draw', 'location_id'])
                  .sort_index()
                  .loc[:, 'em_scalar'])
    return loc_deaths, loc_scalar
def test_merge_multiple_cols_with_mixed_cols_index(self):
    # GH29522
    s = Series(
        range(6),
        MultiIndex.from_product([["A", "B"], [1, 2, 3]],
                                names=["lev1", "lev2"]),
        name="Amount",
    )
    df = DataFrame({"lev1": list("AAABBB"),
                    "lev2": [1, 2, 3, 1, 2, 3],
                    "col": 0})
    result = merge(df, s.reset_index(), on=["lev1", "lev2"])
    expected = DataFrame({
        "lev1": list("AAABBB"),
        "lev2": [1, 2, 3, 1, 2, 3],
        "col": [0] * 6,
        "Amount": range(6),
    })
    tm.assert_frame_equal(result, expected)
def group_rep_transform(method: str, weights: pd.Series, grouped_data,
                        group_col, record_id_col,
                        record_name_col) -> Union[pd.Series, pd.DataFrame]:
    stashed_index = grouped_data.index
    group_of_master_id = get_column(group_col,
                                    grouped_data).reset_index(drop=True)
    group_of_master_id = (group_of_master_id
                          .rename('raw_group_id')
                          .reset_index()
                          .rename(columns={'index': 'weight'}))
    group_of_master_id['weight'] = weights.reset_index(drop=True)
    group_of_master_id['group_rep'] = \
        group_of_master_id.groupby('raw_group_id',
                                   sort=False)['weight'].transform(method)

    record_id_col = get_column(record_id_col, grouped_data)
    new_rep = (record_id_col
               .iloc[group_of_master_id.group_rep]
               .reset_index(drop=True)
               .rename(None))
    if record_name_col is None:
        output = new_rep
    else:
        record_name_col = get_column(record_name_col, grouped_data)
        new_rep_name = (record_name_col
                        .iloc[group_of_master_id.group_rep]
                        .reset_index(drop=True)
                        .rename(None))
        output = pd.concat([new_rep, new_rep_name], axis=1)
    output.index = stashed_index
    return output
def distribution_table(
    self,
    data: Optional[Series] = None,
    top: int = 25,
) -> DataFrame:
    """
    Return a table of the top words found in answers given to the Question.

    :param data: Optional Series containing response texts.
    :param top: Number of words to return counts for.
    """
    data = data if data is not None else self._data
    if data is None:
        raise ValueError('No data!')
    words = pre_process_text_series(data)
    value_counts = Series(words).value_counts()[:top].rename('Count')
    value_counts.index.name = 'Word'
    word_counts = (value_counts.reset_index()
                   .sort_values('Word')
                   .sort_values('Count', ascending=False)
                   .reset_index())
    word_counts = (word_counts
                   .sort_values(['Count', 'Word'], ascending=[False, True])
                   .reset_index()[['Word', 'Count']])
    return word_counts
def variants_vaccines(rate_age_pattern: pd.Series,
                      denom_age_pattern: pd.Series,
                      age_spec_population: pd.Series,
                      rate: pd.Series,
                      day_shift: int,
                      escape_variant_prevalence: pd.Series,
                      severity_variant_prevalence: pd.Series,
                      vaccine_coverage: pd.DataFrame,
                      population: pd.Series,
                      variant_risk_ratio: float,
                      verbose: bool = True):
    escape_variant_prevalence = escape_variant_prevalence.reset_index()
    escape_variant_prevalence['date'] += pd.Timedelta(days=day_shift)
    escape_variant_prevalence = (escape_variant_prevalence
                                 .set_index(['location_id', 'date'])
                                 .loc[:, 'escape_variant_prevalence'])
    escape_variant_prevalence = pd.concat([rate, escape_variant_prevalence],
                                          axis=1)  # borrow axis
    escape_variant_prevalence = (escape_variant_prevalence
                                 ['escape_variant_prevalence'].fillna(0))

    severity_variant_prevalence = severity_variant_prevalence.reset_index()
    severity_variant_prevalence['date'] += pd.Timedelta(days=day_shift)
    severity_variant_prevalence = (severity_variant_prevalence
                                   .set_index(['location_id', 'date'])
                                   .loc[:, 'severity_variant_prevalence'])
    severity_variant_prevalence = pd.concat([rate,
                                             severity_variant_prevalence],
                                            axis=1)  # borrow axis
    severity_variant_prevalence = (severity_variant_prevalence
                                   ['severity_variant_prevalence'].fillna(0))

    lr_e = [f'cumulative_lr_effective_{variant_suffix}'
            for variant_suffix in ['wildtype', 'variant']]
    lr_ep = [f'cumulative_lr_effective_protected_{variant_suffix}'
             for variant_suffix in ['wildtype', 'variant']]
    hr_e = [f'cumulative_hr_effective_{variant_suffix}'
            for variant_suffix in ['wildtype', 'variant']]
    hr_ep = [f'cumulative_hr_effective_protected_{variant_suffix}'
             for variant_suffix in ['wildtype', 'variant']]
    vaccine_coverage = (vaccine_coverage
                        .loc[:, lr_e + lr_ep + hr_e + hr_ep]
                        .reset_index())
    vaccine_coverage['date'] += pd.Timedelta(days=day_shift)
    vaccine_coverage = vaccine_coverage.set_index(['location_id', 'date'])
    vaccine_coverage = pd.concat([rate.rename('rate'), vaccine_coverage],
                                 axis=1)  # borrow axis
    del vaccine_coverage['rate']
    vaccine_coverage = vaccine_coverage.fillna(0)  # not super necessary...

    numerator = pd.Series(100, index=rate.index)
    numerator /= population
    denominator_a = (numerator / rate)
    denominator_ev = (numerator / (rate * variant_risk_ratio))
    denominator_sv = denominator_ev.copy()
    denominator_a *= (1 - (escape_variant_prevalence
                           + severity_variant_prevalence)[denominator_a.index])
    denominator_ev *= escape_variant_prevalence[denominator_ev.index]
    denominator_sv *= severity_variant_prevalence[denominator_sv.index]

    numerator_a = (rate * denominator_a)
    numerator_ev = (rate * variant_risk_ratio * denominator_ev)
    numerator_sv = (rate * variant_risk_ratio * denominator_sv)

    if verbose:
        logger.info('Adjusting ancestral...')
    numerator_lr_a, numerator_hr_a, denominator_lr_a, denominator_hr_a = \
        adjust_by_variant_classification(
            numerator=numerator_a,
            denominator=denominator_a,
            variant_suffixes=['wildtype', 'variant'],
            rate_age_pattern=rate_age_pattern,
            denom_age_pattern=denom_age_pattern,
            age_spec_population=age_spec_population,
            vaccine_coverage=vaccine_coverage,
            population=population,
        )
    if verbose:
        logger.info('Adjusting non-escape...')
    numerator_lr_sv, numerator_hr_sv, denominator_lr_sv, denominator_hr_sv = \
        adjust_by_variant_classification(
            numerator=numerator_sv,
            denominator=denominator_sv,
            variant_suffixes=['wildtype', 'variant'],
            rate_age_pattern=rate_age_pattern,
            denom_age_pattern=denom_age_pattern,
            age_spec_population=age_spec_population,
            vaccine_coverage=vaccine_coverage,
            population=population,
        )
    if verbose:
        logger.info('Adjusting escape...')
    numerator_lr_ev, numerator_hr_ev, denominator_lr_ev, denominator_hr_ev = \
        adjust_by_variant_classification(
            numerator=numerator_ev,
            denominator=denominator_ev,
            variant_suffixes=['variant'],
            rate_age_pattern=rate_age_pattern,
            denom_age_pattern=denom_age_pattern,
            age_spec_population=age_spec_population,
            vaccine_coverage=vaccine_coverage,
            population=population,
        )

    numerator_lr = numerator_lr_a + numerator_lr_ev + numerator_lr_sv
    denominator_lr = denominator_lr_a + denominator_lr_ev + denominator_lr_sv
    numerator_hr = numerator_hr_a + numerator_hr_ev + numerator_hr_sv
    denominator_hr = denominator_hr_a + denominator_hr_ev + denominator_hr_sv

    rate = (numerator_lr + numerator_hr) / (denominator_lr + denominator_hr)
    rate_lr = numerator_lr / denominator_lr
    rate_hr = numerator_hr / denominator_hr
    pct_inf_lr = denominator_lr / (denominator_lr + denominator_hr)
    pct_inf_hr = denominator_hr / (denominator_lr + denominator_hr)

    return rate, rate_lr, rate_hr, pct_inf_lr, pct_inf_hr
def _fit(self,
         X: pd.DataFrame,
         y: pd.Series,
         X_val: Optional[pd.DataFrame] = None,
         y_val: Optional[pd.Series] = None,
         time_limit: Optional[int] = None,
         sample_weight=None,
         verbosity=2,
         **kwargs):
    # try_import_mxnet()
    try_import_autogluon_vision()
    from autogluon.vision import ImagePredictor
    params = self._get_model_params()

    X = self.preprocess(X, fit=True)
    if X_val is not None:
        X_val = self.preprocess(X_val)

    if sample_weight is not None:  # TODO: support
        logger.log(15, "\tsample_weight not yet supported for ImagePredictorModel, "
                       "this model will ignore them in training.")

    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)
    if X_val is not None:
        X_val = X_val.reset_index(drop=True)
        y_val = y_val.reset_index(drop=True)

    X[self._label_column_name] = y
    if X_val is not None:
        X_val[self._label_column_name] = y_val

    null_indices = X['image'] == ''

    # TODO: Consider some kind of weighting of the two options so there isn't a harsh cutoff at 50
    # FIXME: What if all rows in a class are null? Will probably crash.
    if null_indices.sum() > 50:
        self._dummy_pred_proba = self._compute_dummy_pred_proba(
            y[null_indices])  # FIXME: Do this one for better results
    else:
        # Not enough null to get a confident estimate of null label average, instead use all data average
        self._dummy_pred_proba = self._compute_dummy_pred_proba(y)

    if null_indices.sum() > 0:
        X = X[~null_indices]

    if X_val is not None:
        null_indices_val = X_val['image'] == ''
        if null_indices_val.sum() > 0:
            X_val = X_val[~null_indices_val]

    verbosity_image = max(0, verbosity - 1)
    # TODO: ImagePredictor doesn't use problem_type in any way at present.
    #  It also doesn't error or warn if problem_type is not one it expects.
    self.model = ImagePredictor(
        problem_type=self.problem_type,
        path=self.path,
        # eval_metric=self.eval_metric,  # TODO: multiclass/binary vision problem works only with accuracy, regression with rmse
        verbosity=verbosity_image
    )

    logger.log(15, f'\tHyperparameters: {params}')

    # FIXME: ImagePredictor crashes if given float time_limit
    if time_limit is not None:
        time_limit = int(time_limit)

    self.model.fit(train_data=X,
                   tuning_data=X_val,
                   time_limit=time_limit,
                   hyperparameters=params,
                   random_state=0)
def window_agg_udf(
    grouped_data: SeriesGroupBy,
    function: Callable,
    window_lower_indices: pd.Series,
    window_upper_indices: pd.Series,
    mask: pd.Series,
    result_index: pd.Index,
    dtype: np.dtype,
    max_lookback: int,
    *args: Tuple[Any],
    **kwargs: Dict[str, Any],
) -> pd.Series:
    """Apply window aggregation with UDFs.

    Notes:
        Use custom logic to compute the rolling window UDF instead of
        using pandas's rolling function. This is because pandas's rolling
        function doesn't support multi param UDFs.
    """
    assert len(window_lower_indices) == len(window_upper_indices)
    assert len(window_lower_indices) == len(mask)

    # Reset index here so we don't need to deal with mismatching indices
    window_lower_indices = window_lower_indices.reset_index(drop=True)
    window_upper_indices = window_upper_indices.reset_index(drop=True)
    mask = mask.reset_index(drop=True)

    # Compute window indices and manually roll over the window.
    # If a window has only nan values, we output nan for the window
    # result. This follows pandas rolling apply behavior.
    # The first input column is in grouped_data, but there may
    # be additional input columns in args.
    inputs = (grouped_data,) + args

    masked_window_lower_indices = window_lower_indices[mask].astype('i8')
    masked_window_upper_indices = window_upper_indices[mask].astype('i8')

    input_iters = list(
        create_window_input_iter(
            arg, masked_window_lower_indices, masked_window_upper_indices
        )
        if isinstance(arg, (pd.Series, SeriesGroupBy))
        else itertools.repeat(arg)
        for arg in inputs
    )
    valid_result = pd.Series(
        function(*(next(gen) for gen in input_iters))
        for _ in range(len(masked_window_lower_indices))
    )
    valid_result.index = masked_window_lower_indices.index
    result = pd.Series(index=mask.index, dtype=dtype)
    result[mask] = valid_result
    result.index = result_index

    return result
def test_reset_index_name(self):
    s = Series([1, 2, 3], index=Index(range(3), name="x"))
    assert s.reset_index().index.name is None
    assert s.reset_index(drop=True).index.name is None
def test_set_index_cast_datetimeindex(self):
    df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                          for i in range(1000)],
                    'B': np.random.randn(1000)})

    idf = df.set_index('A')
    assert isinstance(idf.index, pd.DatetimeIndex)

    # don't cast a DatetimeIndex WITH a tz, leave as object
    # GH 6032
    i = (pd.DatetimeIndex(
        to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'], errors="raise"))
        .tz_localize('US/Pacific'))
    df = DataFrame(np.random.randn(2, 1), columns=['A'])
    expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                             tz='US/Pacific'),
                                pd.Timestamp('2013-01-02 14:00:00-0800',
                                             tz='US/Pacific')],
                               dtype="object"))

    # convert index to series
    result = Series(i)
    assert_series_equal(result, expected)

    # assign to frame
    df['B'] = i
    result = df['B']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'B'

    # keep the timezone
    result = i.to_series(keep_tz=True)
    assert_series_equal(result.reset_index(drop=True), expected)

    # convert to utc
    df['C'] = i.to_series().reset_index(drop=True)
    result = df['C']
    comp = pd.DatetimeIndex(expected.values).copy()
    comp.tz = None
    tm.assert_numpy_array_equal(result.values, comp.values)

    # list of datetimes with a tz
    df['D'] = i.to_pydatetime()
    result = df['D']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'D'

    # GH 6785
    # set the index manually
    import pytz
    df = DataFrame(
        [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
    expected = df.set_index('ts')
    df.index = df['ts']
    df.pop('ts')
    assert_frame_equal(df, expected)

    # GH 3950
    # reset_index with single level
    for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
        idx = pd.date_range('1/1/2011', periods=5,
                            freq='D', tz=tz, name='idx')
        df = pd.DataFrame(
            {'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)

        expected = pd.DataFrame({'idx': [datetime(2011, 1, 1),
                                         datetime(2011, 1, 2),
                                         datetime(2011, 1, 3),
                                         datetime(2011, 1, 4),
                                         datetime(2011, 1, 5)],
                                 'a': range(5),
                                 'b': ['A', 'B', 'C', 'D', 'E']},
                                columns=['idx', 'a', 'b'])
        expected['idx'] = expected['idx'].apply(
            lambda d: pd.Timestamp(d, tz=tz))
        assert_frame_equal(df.reset_index(), expected)