Example #1
    def test_reset_index(self):
        df = tm.makeDataFrame()[:5]
        ser = df.stack()
        ser.index.names = ['hash', 'category']

        ser.name = 'value'
        df = ser.reset_index()
        self.assertIn('value', df)

        df = ser.reset_index(name='value2')
        self.assertIn('value2', df)

        # check inplace
        s = ser.reset_index(drop=True)
        s2 = ser
        s2.reset_index(drop=True, inplace=True)
        assert_series_equal(s, s2)

        # level
        index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                           labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                                   [0, 1, 0, 1, 0, 1]])
        s = Series(np.random.randn(6), index=index)
        rs = s.reset_index(level=1)
        self.assertEqual(len(rs.columns), 2)

        rs = s.reset_index(level=[0, 2], drop=True)
        self.assertTrue(rs.index.equals(Index(index.get_level_values(1))))
        tm.assertIsInstance(rs, Series)
Example #2
    def test_reset_index(self):
        df = tm.makeDataFrame()[:5]
        ser = df.stack()
        ser.index.names = ["hash", "category"]

        ser.name = "value"
        df = ser.reset_index()
        self.assertIn("value", df)

        df = ser.reset_index(name="value2")
        self.assertIn("value2", df)

        # check inplace
        s = ser.reset_index(drop=True)
        s2 = ser
        s2.reset_index(drop=True, inplace=True)
        assert_series_equal(s, s2)

        # level
        index = MultiIndex(
            levels=[["bar"], ["one", "two", "three"], [0, 1]],
            labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
        )
        s = Series(np.random.randn(6), index=index)
        rs = s.reset_index(level=1)
        self.assertEqual(len(rs.columns), 2)

        rs = s.reset_index(level=[0, 2], drop=True)
        self.assertTrue(rs.index.equals(Index(index.get_level_values(1))))
        tm.assertIsInstance(rs, Series)
Example #3
    def test_reset_index(self):
        df = tm.makeDataFrame()[:5]
        ser = df.stack()
        ser.index.names = ['hash', 'category']

        ser.name = 'value'
        df = ser.reset_index()
        assert 'value' in df

        df = ser.reset_index(name='value2')
        assert 'value2' in df

        # check inplace
        s = ser.reset_index(drop=True)
        s2 = ser
        s2.reset_index(drop=True, inplace=True)
        tm.assert_series_equal(s, s2)

        # level
        index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                           codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                                  [0, 1, 0, 1, 0, 1]])
        s = Series(np.random.randn(6), index=index)
        rs = s.reset_index(level=1)
        assert len(rs.columns) == 2

        rs = s.reset_index(level=[0, 2], drop=True)
        tm.assert_index_equal(rs.index, Index(index.get_level_values(1)))
        assert isinstance(rs, Series)
Example #4
    def test_set_index_cast_datetimeindex(self):
        df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                              for i in range(1000)],
                        'B': np.random.randn(1000)})

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00',
                         '2013-1-2 14:00'], errors="raise"))
             .tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                                 tz='US/Pacific'),
                                    pd.Timestamp('2013-01-02 14:00:00-0800',
                                                 tz='US/Pacific')],
                                   dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        comp = pd.DatetimeIndex(expected.values)
        comp = comp.tz_localize(None)
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame(
            [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)
Example #5
 def test_reset_index_range(self):
     # GH 12071
     s = Series(range(2), name='A', dtype='int64')
     series_result = s.reset_index()
     assert isinstance(series_result.index, RangeIndex)
     series_expected = DataFrame([[0, 0], [1, 1]],
                                 columns=['index', 'A'],
                                 index=RangeIndex(stop=2))
     tm.assert_frame_equal(series_result, series_expected)
Example #6
 def test_droplevel(self):
     # GH20342
     ser = Series([1, 2, 3, 4])
     ser.index = MultiIndex.from_arrays([(1, 2, 3, 4), (5, 6, 7, 8)],
                                        names=['a', 'b'])
     expected = ser.reset_index('b', drop=True)
     result = ser.droplevel('b', axis='index')
     tm.assert_series_equal(result, expected)
     # test that droplevel raises ValueError on axis != 0
     with pytest.raises(ValueError):
         ser.droplevel(1, axis='columns')
Example #7
    def test_reset_index_right_dtype(self):
        time = np.arange(0.0, 10, np.sqrt(2) / 2)
        s1 = Series((9.81 * time**2) / 2,
                    index=Index(time, name='time'),
                    name='speed')
        df = DataFrame(s1)

        resetted = s1.reset_index()
        self.assertEqual(resetted['time'].dtype, np.float64)

        resetted = df.reset_index()
        self.assertEqual(resetted['time'].dtype, np.float64)
Example #8
 def test_droplevel(self):
     # GH20342
     ser = Series([1, 2, 3, 4])
     ser.index = MultiIndex.from_arrays(
         [(1, 2, 3, 4), (5, 6, 7, 8)], names=["a", "b"]
     )
     expected = ser.reset_index("b", drop=True)
     result = ser.droplevel("b", axis="index")
     tm.assert_series_equal(result, expected)
     # test that droplevel raises ValueError on axis != 0
     with pytest.raises(ValueError):
         ser.droplevel(1, axis="columns")
Example #9
def test_select_words_of_length(words: List[str], min_len: Optional[int],
                                max_len: Optional[int],
                                exp: pd.Series) -> None:
    """Test the _select_words_of_length with different cases."""

    word_pool = WordPool(words)
    word_pool.select_words_of_length(min_len, max_len)
    obs: pd.Series = word_pool._pool_cleaned

    obs = obs.reset_index(drop=True)
    exp = exp.reset_index(drop=True)
    assert obs.equals(exp)
Example #10
    def test_reset_index_right_dtype(self):
        time = np.arange(0.0, 10, np.sqrt(2) / 2)
        s1 = Series((9.81 * time**2) / 2,
                    index=Index(time, name="time"),
                    name="speed")
        df = DataFrame(s1)

        resetted = s1.reset_index()
        assert resetted["time"].dtype == np.float64

        resetted = df.reset_index()
        assert resetted["time"].dtype == np.float64
Example #11
    def test_reset_index_right_dtype(self):
        time = np.arange(0.0, 10, np.sqrt(2) / 2)
        s1 = Series((9.81 * time ** 2) / 2,
                    index=Index(time, name='time'),
                    name='speed')
        df = DataFrame(s1)

        resetted = s1.reset_index()
        assert resetted['time'].dtype == np.float64

        resetted = df.reset_index()
        assert resetted['time'].dtype == np.float64
Example #12
    def __init__(
        self, data: pd.Series, kind: Literal['support', 'resistance'] = 'support'
    ):
        if not isinstance(data, pd.Series):
            raise TypeError('data should be pd.Series')

        self.y = data.reset_index(drop=True).rename('y').rename_axis('x')
        self.x = self.y.index.to_series()

        self.length = len(self.y)

        self.kind = kind
        self.dot_color = 'g' if kind == 'support' else 'r'
Example #13
 def get_flat_demand_cost(ser: pd.Series):
     idx, reported_val, max_val = get_interval_max_demand(
         ser.values, n_intervals=demand_window_intervals)
     ts = ser.reset_index()['index'].iloc[idx]
      # flat demand cost is the same whether we're on a holiday/weekend
      # or a weekday, so no schedule branching is needed here
      return calculate_flat_cost(reported_val, ts.month,
                                 self.flat_demand_months,
                                 self.flat_demand_rates)
Example #14
def find_anomalies(volume_data: pd.Series, std_dev_cutoff: int = 10, days_cutoff: int = 3):
    """

    Args:
        volume_data:
        std_dev_cutoff:

    Returns:

    """
    volume_data.reset_index(level=0, inplace=True)
    volume_data.columns = ['date', 'volume']
    cutoff_date = volume_data["date"].max() - pd.Timedelta(days=days_cutoff)

    data_std_dev = volume_data['volume'].std()
    data_mean = volume_data['volume'].mean()
    anomaly_cut_off = data_mean + data_std_dev * std_dev_cutoff
    anomaly_data = volume_data[volume_data['volume'] > anomaly_cut_off].copy()
    anomaly_data['std_devs'] = (volume_data['volume'] - data_mean) / data_std_dev
    anomaly_data = anomaly_data[anomaly_data['date'] >= cutoff_date]
    anomaly_data['date'] = anomaly_data['date'].astype(str).str[:10]
    return anomaly_data, data_mean, data_std_dev
Example #15
    def write_serie(self, serie: pd.Series, periodicity: str, fields: dict,
                    writer: csv.writer):
        field_id = fields[serie.name]

        # Filter out leading/trailing NaN
        serie = serie[serie.first_valid_index():serie.last_valid_index()]

        df = serie.reset_index().apply(self.rows,
                                       axis=1,
                                       args=(self.fields_data, field_id,
                                             periodicity))

        serie = pd.Series(df.values, index=serie.index)
        for row in serie:
            writer.writerow(row)
Example #16
def pure_profit_score(close: Series) -> Tuple[float, int]:
    """Pure Profit Score of a series.

    Args:
        close (pd.Series): Series of 'close's

    >>> result = ta.pure_profit_score(df.close)
    """
    close = verify_series(close)
    close_index = Series(0, index=close.reset_index().index)

    r = linear_regression(close_index, close)["r"]
    if r is not npNaN:
        return r * cagr(close)
    return 0
Example #17
    def __init__(self, data_id: pd.Series, task_id: int,
                 target_name: pd.Series, task_target: pd.Series,
                 shared_target: pd.Series, task_claim: pd.Series,
                 shared_claim: pd.Series, task_attn_mask: pd.Series,
                 shared_attn_mask: pd.Series, labels: pd.Series):

        self.data_id = [idx for idx in data_id]
        self.task_id = task_id
        self.target_name = target_name.reset_index(drop=True)
        self.task_target = [ids for ids in task_target]
        self.shared_target = [ids for ids in shared_target]
        self.task_claim = [ids for ids in task_claim]
        self.shared_claim = [ids for ids in shared_claim]
        self.task_attn_mask = [mask for mask in task_attn_mask]
        self.shared_attn_mask = [mask for mask in shared_attn_mask]
        self.label = [label for label in labels]
Example #18
def create_infections_from_deaths(
    daily_deaths: pd.Series,
    pred_ifr: pd.Series,
    durations: Dict,
) -> pd.Series:
    daily_deaths = (daily_deaths.reset_index().groupby('location_id').apply(
        lambda x: pd.Series(x['daily_deaths'].rolling(
            window=7, min_periods=7, center=True).mean().values,
                            index=x['date'])).dropna())

    infections = (
        daily_deaths /
        pred_ifr).rename('infections').dropna().sort_index().reset_index()
    infections['date'] -= pd.Timedelta(days=durations['exposure_to_death'])
    infections = infections.set_index(['location_id', 'date'])

    return infections
Example #19
def pie_chart(data: pd.Series,
              colors: List[str] = None,
              title: str = None,
              plot_height: int = 250,
              plot_width=None,
              radius: float = 0.1,
              toolbar_location: str = 'right',
              x_range=None,
              show_legend=True) -> Figure:
    data = data.reset_index(name='value').rename(columns={'index': 'column'})
    data['angle'] = data['value'] / data['value'].sum() * 2 * pi
    data['perc'] = data['value'] / data['value'].sum() * 100
    if colors:
        data['color'] = colors
    else:
        data['color'] = Category20c[len(
            data)] if len(data) > 2 else Category20c[3][:2]

    p = figure(
        plot_height=plot_height,
        title=title,
        toolbar_location=toolbar_location,
        tools="pan,save,hover",
        tooltips="@column: @value (@perc%)",
        x_range=x_range,
    )

    p.wedge(x=0,
            y=1,
            radius=radius,
            start_angle=cumsum('angle', include_zero=True),
            end_angle=cumsum('angle'),
            line_color="white",
            fill_color='color',
            legend_field='column',
            source=data)

    p.axis.axis_label = None
    p.axis.visible = False
    p.grid.grid_line_color = None
    p.legend.visible = show_legend

    if plot_width:
        p.width = plot_width

    return p
Example #20
def pd_column_analysis(ds: pd.Series,
                       value_col: str = "values",
                       sort: str = None) -> pd.DataFrame:
    """Returns formatted dataframe from standard column analysis"""

    df = ds.reset_index().rename(columns={"index": "columns", 0: value_col})

    if sort is None:
        return df

    if sort.lower().startswith("asc"):
        df = df.sort_values(by=value_col, ascending=True)

    if sort.lower().startswith("desc"):
        df = df.sort_values(by=value_col, ascending=False)

    return df.reset_index(drop=True)
Example #21
    def __resampleData(self, data: pd.Series, resampleFreq: str,
                       aggStrategy: str) -> pd.Series:
        if len(data) == 0:
            return data
        if pd.isna(resampleFreq):
            return data
        if not (resampleFreq.lower() in ['s', 'm', 'b', 'h', 'd']):
            return data
        if pd.isna(aggStrategy) or (aggStrategy.lower() == 'raw'):
            return data

        # storing series labels
        seriesName = data.name
        indName = data.index.name

        # changing series labels
        data.name = 'vals'
        data.index.name = 'times'
        data = data.reset_index()
        # modify times as per resampleFreq
        # https://stackoverflow.com/questions/43400331/remove-seconds-and-minutes-from-a-pandas-dataframe-column
        if resampleFreq.lower() == 'd':
            data = data.assign(times=data.times.dt.floor('D'))
        elif resampleFreq.lower() == 'h':
            data = data.assign(times=data.times.dt.floor('H'))
        elif resampleFreq.lower() == 'm':
            data = data.assign(times=data.times.dt.floor('min'))
        elif resampleFreq.lower() == 's':
            data = data.assign(times=data.times.dt.floor('S'))
        elif resampleFreq.lower() == 'b':
            data = data.assign(times=data.times.dt.floor('min'))
            data.times = data.times.map(
                lambda x: x.replace(minute=(x.minute - x.minute % 15)))

        # aggregate the samples based on times
        if aggStrategy.lower() == 'snap':
            data = data.groupby('times', as_index=False).first()
        elif aggStrategy.lower() == 'average':
            data = data.groupby('times', as_index=False).mean()
        data = pd.Series(data.vals.values, index=data.times.values)

        # restore original labels
        data.name = seriesName
        data.index.name = indName
        return data
Example #22
def _clean_timeseries(observed_ts: pd.Series) -> pd.Series:
    """Clean and Normalize time_series for subsequent processing. The following is performed on the time_series:

    - index_reset
    - duplicates dropped
    - na values dropped

    Args:
        observed_ts (Series): The time_series to normalize.

    Returns:
        (Series): The normalized time_series
    """

    observed_ts = observed_ts.reset_index(drop=True)
    observed_ts = observed_ts.drop_duplicates()
    observed_ts = observed_ts.dropna()
    return observed_ts
Example #23
 def get_demand_cost(ser: pd.Series):
     idx, reported_val, max_val = get_interval_max_demand(
         ser.values, n_intervals=demand_window_intervals)
     ts = ser.reset_index()['index'].iloc[idx]
      cost = 0.0
      try:
          # if we're on a holiday or weekend
          if (ts.date() in self.holidays
                  or ts.dayofweek not in self.weekmask):
              cost = calculate_tou_cost(reported_val, ts.month, ts.hour,
                                        self.demand_weekend_schedule,
                                        self.demand_rates)
          else:
              # otherwise, it's a weekday
              cost = calculate_tou_cost(reported_val, ts.month, ts.hour,
                                        self.demand_weekday_schedule,
                                        self.demand_rates)
      except Exception:
          pass
      return cost
Example #24
    def __init__(self,
                 master: pd.Series,
                 duplicates: Optional[pd.Series] = None,
                 master_id: Optional[pd.Series] = None,
                 duplicates_id: Optional[pd.Series] = None,
                 **kwargs):
        """
        StringGrouper is a class that holds the matrix with cosine similarities between the master and duplicates
        matrix. If duplicates is not given it is replaced by master. To build this matrix the `fit` function must be
        called. It is possible to add and remove matches after building with the add_match and remove_match functions

        :param master: pandas.Series. A series of strings in which similar strings are searched, either against itself
        or against the `duplicates` series.
        :param duplicates: pandas.Series. If set, for each string in duplicates a similar string is searched in Master.
        :param master_id: pandas.Series. If set, contains ID values for each row in master series.
        :param duplicates_id: pandas.Series. If set, contains ID values for each row in duplicates series.
        :param kwargs: All other keyword arguments are passed to StringGrouperConfig
        """
        # Validate match strings input
        if not StringGrouper._is_series_of_strings(master) or \
                (duplicates is not None and not StringGrouper._is_series_of_strings(duplicates)):
            raise TypeError(
                'Input does not consist of pandas.Series containing only Strings'
            )
        # Validate optional IDs input
        if not StringGrouper._is_input_data_combination_valid(
                duplicates, master_id, duplicates_id):
            raise Exception('List of data Series options is invalid')
        StringGrouper._validate_id_data(master, duplicates, master_id,
                                        duplicates_id)

        self._master: pd.Series = master.reset_index(drop=True)
        self._duplicates: pd.Series = duplicates.reset_index(
            drop=True) if duplicates is not None else None
        self._master_id: pd.Series = master_id.reset_index(
            drop=True) if master_id is not None else None
        self._duplicates_id: pd.Series = duplicates_id.reset_index(
            drop=True) if duplicates_id is not None else None
        self._config: StringGrouperConfig = StringGrouperConfig(**kwargs)
        self.is_build = False  # indicates whether the grouper has been fit
        self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams)
        # After the StringGrouper is built, _matches_list will contain the indices and similarities of matched pairs
        self._matches_list: pd.DataFrame = pd.DataFrame()
Example #25
def _add_label_and_progress(
        s: pd.Series,
        pretty_models: Mapping[str, common_config.RewardCfg]) -> pd.DataFrame:
    """Add pretty label and checkpoint progress to reward distances."""
    labels = s.index.map(
        functools.partial(_pretty_label, pretty_mapping=pretty_models))
    df = s.reset_index(name="Distance")

    regex = ".*/checkpoints/(?P<Checkpoint>final|[0-9]+)(?:/.*)?$"
    match = df["source_reward_path"].str.extract(regex)
    match["Reward"] = labels

    grp = match.groupby("Reward")
    progress = grp.apply(_checkpoint_to_progress)
    progress = progress.reset_index("Reward", drop=True)
    df["Progress"] = progress
    df["Reward"] = labels

    return df
Example #26
    def test_reset_index_drop_errors(self):
        #  GH 20925

        # KeyError raised for series index when passed level name is missing
        s = Series(range(4))
        with pytest.raises(KeyError, match='must be same as name'):
            s.reset_index('wrong', drop=True)
        with pytest.raises(KeyError, match='must be same as name'):
            s.reset_index('wrong')

        # KeyError raised for series when level to be dropped is missing
        s = Series(range(4), index=MultiIndex.from_product([[1, 2]] * 2))
        with pytest.raises(KeyError, match='not found'):
            s.reset_index('wrong', drop=True)
Example #28
    def test_reset_index_drop_errors(self):
        #  GH 20925

        # KeyError raised for series index when passed level name is missing
        s = Series(range(4))
        with pytest.raises(KeyError, match="does not match index name"):
            s.reset_index("wrong", drop=True)
        with pytest.raises(KeyError, match="does not match index name"):
            s.reset_index("wrong")

        # KeyError raised for series when level to be dropped is missing
        s = Series(range(4), index=MultiIndex.from_product([[1, 2]] * 2))
        with pytest.raises(KeyError, match="not found"):
            s.reset_index("wrong", drop=True)
Example #29
    def test_factor_rank_autocorrelation(self, factor_values,
                                         sector_values, end_date,
                                         time_rule, by_sector,
                                         expected_vals):
        dr = date_range(start='2015-1-1', end=end_date)
        dr.name = 'date'
        tickers = ['A', 'B', 'C', 'D']
        factor_df = DataFrame(index=dr, columns=tickers, data=factor_values).stack()
        factor_df.index = factor_df.index.set_names(['date', 'asset'])

        factor = Series(factor_df)
        factor.name = 'factor'
        factor = factor.reset_index()
        factor['sector'] = sector_values
        factor = factor.set_index(['date', 'asset', 'sector']).factor

        fa = factor_rank_autocorrelation(factor, time_rule, by_sector)
        expected = Series(index=fa.index, data=expected_vals)

        assert_series_equal(fa, expected)
Example #30
    def test_factor_rank_autocorrelation(self, factor_values, group_values,
                                         end_date, time_rule, by_group,
                                         expected_vals):
        dr = date_range(start='2015-1-1', end=end_date)
        dr.name = 'date'
        tickers = ['A', 'B', 'C', 'D']
        factor_df = DataFrame(index=dr, columns=tickers, data=factor_values)\
            .stack()
        factor_df.index = factor_df.index.set_names(['date', 'asset'])

        factor = Series(factor_df)
        factor.name = 'factor'
        factor = factor.reset_index()
        factor['group'] = group_values
        factor = factor.set_index(['date', 'asset', 'group']).factor

        fa = factor_rank_autocorrelation(factor, time_rule, by_group)
        expected = Series(index=fa.index, data=expected_vals)

        assert_series_equal(fa, expected)
Example #31
def get_sub_loc_deaths(
    sub_location: int,
    n_draws: int,
    sub_infections_draws: pd.DataFrame,
    ifr: pd.DataFrame,
    durations: List[Dict],
    reported_deaths: pd.Series,
) -> Tuple[pd.DataFrame, pd.Series]:
    if sub_location in reported_deaths.reset_index()['location_id'].to_list():
        reported_deaths = reported_deaths.loc[sub_location]
    else:
        reported_deaths = 1

    loc_deaths = []
    loc_scalar = []
    for draw in range(n_draws):
        _ifr = ifr.loc[sub_location, draw]['ratio']
        _deaths = sub_infections_draws.loc[sub_location,
                                           f'draw_{draw}'].reset_index()
        _deaths['date'] += pd.Timedelta(
            days=durations[draw]['exposure_to_death'])
        _deaths = _deaths.set_index('date').loc[:, f'draw_{draw}']
        _deaths = (_deaths * _ifr).dropna().rename(f'draw_{draw}')
        trim_days = durations[draw]['exposure_to_death'] - durations[draw][
            'exposure_to_case']
        _deaths = _deaths[:-trim_days]
        loc_scalar.append(_deaths.sum() / reported_deaths)
        loc_deaths.append(_deaths)
    loc_deaths = pd.concat(loc_deaths, axis=1).dropna()
    loc_deaths['location_id'] = sub_location
    loc_deaths = (loc_deaths.reset_index().set_index(['location_id',
                                                      'date']).sort_index())
    loc_scalar = pd.DataFrame({
        'draw': list(range(n_draws)),
        'location_id': sub_location,
        'em_scalar': loc_scalar,
    })
    loc_scalar = (loc_scalar.set_index(['draw', 'location_id'
                                        ]).sort_index().loc[:, 'em_scalar'])

    return loc_deaths, loc_scalar
Example #32
 def test_merge_multiple_cols_with_mixed_cols_index(self):
     # GH29522
     s = Series(
         range(6),
         MultiIndex.from_product([["A", "B"], [1, 2, 3]],
                                 names=["lev1", "lev2"]),
         name="Amount",
     )
     df = DataFrame({
         "lev1": list("AAABBB"),
         "lev2": [1, 2, 3, 1, 2, 3],
         "col": 0
     })
     result = merge(df, s.reset_index(), on=["lev1", "lev2"])
     expected = DataFrame({
         "lev1": list("AAABBB"),
         "lev2": [1, 2, 3, 1, 2, 3],
         "col": [0] * 6,
         "Amount": range(6),
     })
     tm.assert_frame_equal(result, expected)
Example #33
def group_rep_transform(method: str, weights: pd.Series, grouped_data,
                        group_col, record_id_col,
                        record_name_col) -> Union[pd.Series, pd.DataFrame]:
    stashed_index = grouped_data.index
    group_of_master_id = get_column(group_col,
                                    grouped_data).reset_index(drop=True)
    group_of_master_id = group_of_master_id.rename(
        'raw_group_id').reset_index().rename(columns={'index': 'weight'})
    group_of_master_id['weight'] = weights.reset_index(drop=True)
    group_of_master_id['group_rep'] = \
        group_of_master_id.groupby('raw_group_id', sort=False)['weight'].transform(method)
    record_id_col = get_column(record_id_col, grouped_data)
    new_rep = record_id_col.iloc[group_of_master_id.group_rep].reset_index(
        drop=True).rename(None)
    if record_name_col is None:
        output = new_rep
    else:
        record_name_col = get_column(record_name_col, grouped_data)
        new_rep_name = record_name_col.iloc[
            group_of_master_id.group_rep].reset_index(drop=True).rename(None)
        output = pd.concat([new_rep, new_rep_name], axis=1)
    output.index = stashed_index
    return output
Example #34
    def distribution_table(
        self,
        data: Optional[Series] = None,
        top: int = 25,
    ) -> DataFrame:
        """
        Return a table of the top words found in answers given to the Question.

        :param data: Optional Series containing response texts.
        :param top: Number of words to return counts for.
        """
        data = data if data is not None else self._data
        if data is None:
            raise ValueError('No data!')
        words = pre_process_text_series(data)
        value_counts = Series(words).value_counts()[:top].rename('Count')
        value_counts.index.name = 'Word'
        word_counts = (
            value_counts.reset_index().sort_values('Word').sort_values(
                'Count', ascending=False).reset_index())
        word_counts = word_counts.sort_values(
            ['Count', 'Word'],
            ascending=[False, True]).reset_index()[['Word', 'Count']]
        return word_counts
Example #35
def variants_vaccines(rate_age_pattern: pd.Series,
                      denom_age_pattern: pd.Series,
                      age_spec_population: pd.Series,
                      rate: pd.Series,
                      day_shift: int,
                      escape_variant_prevalence: pd.Series,
                      severity_variant_prevalence: pd.Series,
                      vaccine_coverage: pd.DataFrame,
                      population: pd.Series,
                      variant_risk_ratio: float,
                      verbose: bool = True,):
    escape_variant_prevalence = escape_variant_prevalence.reset_index()
    escape_variant_prevalence['date'] += pd.Timedelta(days=day_shift)
    escape_variant_prevalence = (escape_variant_prevalence
                                 .set_index(['location_id', 'date'])
                                 .loc[:, 'escape_variant_prevalence'])
    escape_variant_prevalence = pd.concat([rate, escape_variant_prevalence], axis=1)  # borrow axis
    escape_variant_prevalence = escape_variant_prevalence['escape_variant_prevalence'].fillna(0)
    
    severity_variant_prevalence = severity_variant_prevalence.reset_index()
    severity_variant_prevalence['date'] += pd.Timedelta(days=day_shift)
    severity_variant_prevalence = (severity_variant_prevalence
                                   .set_index(['location_id', 'date'])
                                   .loc[:, 'severity_variant_prevalence'])
    severity_variant_prevalence = pd.concat([rate, severity_variant_prevalence], axis=1)  # borrow axis
    severity_variant_prevalence = severity_variant_prevalence['severity_variant_prevalence'].fillna(0)

    lr_e = [f'cumulative_lr_effective_{variant_suffix}' for variant_suffix in ['wildtype', 'variant']]
    lr_ep = [f'cumulative_lr_effective_protected_{variant_suffix}' for variant_suffix in ['wildtype', 'variant']]
    hr_e = [f'cumulative_hr_effective_{variant_suffix}' for variant_suffix in ['wildtype', 'variant']]
    hr_ep = [f'cumulative_hr_effective_protected_{variant_suffix}' for variant_suffix in ['wildtype', 'variant']]
    vaccine_coverage = (vaccine_coverage
                        .loc[:, lr_e + lr_ep + hr_e + hr_ep]
                        .reset_index())
    vaccine_coverage['date'] += pd.Timedelta(days=day_shift)
    vaccine_coverage = vaccine_coverage.set_index(['location_id', 'date'])
    vaccine_coverage = pd.concat([rate.rename('rate'), vaccine_coverage], axis=1)  # borrow axis
    del vaccine_coverage['rate']
    vaccine_coverage = vaccine_coverage.fillna(0)
    
    # not super necessary...
    numerator = pd.Series(100, index=rate.index)
    numerator /= population
    
    denominator_a = (numerator / rate)
    denominator_ev = (numerator / (rate * variant_risk_ratio))
    denominator_sv = denominator_ev.copy()
    denominator_a *= (1 - (escape_variant_prevalence + severity_variant_prevalence)[denominator_a.index])
    denominator_ev *= escape_variant_prevalence[denominator_ev.index]
    denominator_sv *= severity_variant_prevalence[denominator_sv.index]

    numerator_a = (rate * denominator_a)
    numerator_ev = (rate * variant_risk_ratio * denominator_ev)
    numerator_sv = (rate * variant_risk_ratio * denominator_sv)
    
    if verbose:
        logger.info('Adjusting ancestral...')
    numerator_lr_a, numerator_hr_a, denominator_lr_a, denominator_hr_a = adjust_by_variant_classification(
        numerator=numerator_a,
        denominator=denominator_a,
        variant_suffixes=['wildtype', 'variant',],
        rate_age_pattern=rate_age_pattern,
        denom_age_pattern=denom_age_pattern,
        age_spec_population=age_spec_population,
        vaccine_coverage=vaccine_coverage,
        population=population,
    )
    if verbose:
        logger.info('Adjusting non-escape...')
    numerator_lr_sv, numerator_hr_sv, denominator_lr_sv, denominator_hr_sv = adjust_by_variant_classification(
        numerator=numerator_sv,
        denominator=denominator_sv,
        variant_suffixes=['wildtype', 'variant'],
        rate_age_pattern=rate_age_pattern,
        denom_age_pattern=denom_age_pattern,
        age_spec_population=age_spec_population,
        vaccine_coverage=vaccine_coverage,
        population=population,
    )
    if verbose:
        logger.info('Adjusting escape...')
    numerator_lr_ev, numerator_hr_ev, denominator_lr_ev, denominator_hr_ev = adjust_by_variant_classification(
        numerator=numerator_ev,
        denominator=denominator_ev,
        variant_suffixes=['variant',],
        rate_age_pattern=rate_age_pattern,
        denom_age_pattern=denom_age_pattern,
        age_spec_population=age_spec_population,
        vaccine_coverage=vaccine_coverage,
        population=population,
    )
    
    numerator_lr = numerator_lr_a + numerator_lr_ev + numerator_lr_sv
    denominator_lr = denominator_lr_a + denominator_lr_ev + denominator_lr_sv
    numerator_hr = numerator_hr_a + numerator_hr_ev + numerator_hr_sv
    denominator_hr = denominator_hr_a + denominator_hr_ev + denominator_hr_sv
    
    rate = (numerator_lr + numerator_hr) / (denominator_lr + denominator_hr)
    rate_lr = numerator_lr / denominator_lr
    rate_hr = numerator_hr / denominator_hr
    
    pct_inf_lr = denominator_lr / (denominator_lr + denominator_hr)
    pct_inf_hr = denominator_hr / (denominator_lr + denominator_hr)
    
    return rate, rate_lr, rate_hr, pct_inf_lr, pct_inf_hr
Example #36
    def test_set_index_cast_datetimeindex(self):
        df = DataFrame({
            'A': [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)],
            'B':
            np.random.randn(1000)
        })

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'],
                        errors="raise")).tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(
            np.array([
                pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')
            ],
                     dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        comp = pd.DatetimeIndex(expected.values).copy()
        comp.tz = None
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame([{
            'ts': datetime(2014, 4, 1, tzinfo=pytz.utc),
            'foo': 1
        }])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)

        # GH 3950
        # reset_index with single level
        for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
            idx = pd.date_range('1/1/2011',
                                periods=5,
                                freq='D',
                                tz=tz,
                                name='idx')
            df = pd.DataFrame({
                'a': range(5),
                'b': ['A', 'B', 'C', 'D', 'E']
            },
                              index=idx)

            expected = pd.DataFrame(
                {
                    'idx': [
                        datetime(2011, 1, 1),
                        datetime(2011, 1, 2),
                        datetime(2011, 1, 3),
                        datetime(2011, 1, 4),
                        datetime(2011, 1, 5)
                    ],
                    'a':
                    range(5),
                    'b': ['A', 'B', 'C', 'D', 'E']
                },
                columns=['idx', 'a', 'b'])
            expected['idx'] = expected['idx'].apply(
                lambda d: pd.Timestamp(d, tz=tz))
            assert_frame_equal(df.reset_index(), expected)
Example #37
    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             verbosity=2,
             **kwargs):
        # try_import_mxnet()
        try_import_autogluon_vision()
        from autogluon.vision import ImagePredictor
        params = self._get_model_params()

        X = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)

        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "\tsample_weight not yet supported for ImagePredictorModel, this model will ignore them in training."
            )

        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        if X_val is not None:
            X_val = X_val.reset_index(drop=True)
            y_val = y_val.reset_index(drop=True)
        X[self._label_column_name] = y
        if X_val is not None:
            X_val[self._label_column_name] = y_val

        null_indices = X['image'] == ''

        # TODO: Consider some kind of weighting of the two options so there isn't a harsh cutoff at 50
        # FIXME: What if all rows in a class are null? Will probably crash.
        if null_indices.sum() > 50:
            self._dummy_pred_proba = self._compute_dummy_pred_proba(
                y[null_indices])  # FIXME: Do this one for better results
        else:
            # Not enough null to get a confident estimate of null label average, instead use all data average
            self._dummy_pred_proba = self._compute_dummy_pred_proba(y)

        if null_indices.sum() > 0:
            X = X[~null_indices]
        if X_val is not None:
            null_indices_val = X_val['image'] == ''
            if null_indices_val.sum() > 0:
                X_val = X_val[~null_indices_val]

        verbosity_image = max(0, verbosity - 1)
        # TODO: ImagePredictor doesn't use problem_type in any way at present.
        #  It also doesn't error or warn if problem_type is not one it expects.
        self.model = ImagePredictor(
            problem_type=self.problem_type,
            path=self.path,
            # eval_metric=self.eval_metric,  # TODO: multiclass/binary vision problem works only with accuracy, regression with rmse
            verbosity=verbosity_image)

        logger.log(15, f'\tHyperparameters: {params}')

        # FIXME: ImagePredictor crashes if given float time_limit
        if time_limit is not None:
            time_limit = int(time_limit)

        self.model.fit(train_data=X,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       hyperparameters=params,
                       random_state=0)
Example #38
def window_agg_udf(
    grouped_data: SeriesGroupBy,
    function: Callable,
    window_lower_indices: pd.Series,
    window_upper_indices: pd.Series,
    mask: pd.Series,
    result_index: pd.Index,
    dtype: np.dtype,
    max_lookback: int,
    *args: Tuple[Any],
    **kwargs: Dict[str, Any],
) -> pd.Series:
    """Apply window aggregation with UDFs.

    Notes:
    Custom logic is used to compute the rolling window UDF instead of
    pandas' rolling function, because pandas' rolling function doesn't
    support multi-parameter UDFs.
    """
    assert len(window_lower_indices) == len(window_upper_indices)
    assert len(window_lower_indices) == len(mask)

    # Reset index here so we don't need to deal with mismatching
    # indices
    window_lower_indices = window_lower_indices.reset_index(drop=True)
    window_upper_indices = window_upper_indices.reset_index(drop=True)
    mask = mask.reset_index(drop=True)

    # Compute window indices and manually roll
    # over the window.

    # If a window has only NaN values, we output NaN for
    # the window result. This follows pandas' rolling apply
    # behavior.

    # The first input column is in grouped_data, but there may
    # be additional input columns in args.
    inputs = (grouped_data,) + args

    masked_window_lower_indices = window_lower_indices[mask].astype('i8')
    masked_window_upper_indices = window_upper_indices[mask].astype('i8')

    input_iters = list(
        create_window_input_iter(
            arg, masked_window_lower_indices, masked_window_upper_indices
        )
        if isinstance(arg, (pd.Series, SeriesGroupBy))
        else itertools.repeat(arg)
        for arg in inputs
    )

    valid_result = pd.Series(
        function(*(next(gen) for gen in input_iters))
        for i in range(len(masked_window_lower_indices))
    )

    valid_result = pd.Series(valid_result)
    valid_result.index = masked_window_lower_indices.index
    result = pd.Series(index=mask.index, dtype=dtype)
    result[mask] = valid_result
    result.index = result_index

    return result
Example #39
 def test_reset_index_name(self):
     s = Series([1, 2, 3], index=Index(range(3), name="x"))
     assert s.reset_index().index.name is None
     assert s.reset_index(drop=True).index.name is None
Example #40
 def test_reset_index_name(self):
     s = Series([1, 2, 3], index=Index(range(3), name='x'))
     assert s.reset_index().index.name is None
     assert s.reset_index(drop=True).index.name is None
Example #41
    def test_set_index_cast_datetimeindex(self):
        df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                              for i in range(1000)],
                        'B': np.random.randn(1000)})

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00',
                         '2013-1-2 14:00'], errors="raise"))
             .tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                                 tz='US/Pacific'),
                                    pd.Timestamp('2013-01-02 14:00:00-0800',
                                                 tz='US/Pacific')],
                                   dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        comp = pd.DatetimeIndex(expected.values).copy()
        comp.tz = None
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame(
            [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)

        # GH 3950
        # reset_index with single level
        for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
            idx = pd.date_range('1/1/2011', periods=5,
                                freq='D', tz=tz, name='idx')
            df = pd.DataFrame(
                {'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)

            expected = pd.DataFrame({'idx': [datetime(2011, 1, 1),
                                             datetime(2011, 1, 2),
                                             datetime(2011, 1, 3),
                                             datetime(2011, 1, 4),
                                             datetime(2011, 1, 5)],
                                     'a': range(5),
                                     'b': ['A', 'B', 'C', 'D', 'E']},
                                    columns=['idx', 'a', 'b'])
            expected['idx'] = expected['idx'].apply(
                lambda d: pd.Timestamp(d, tz=tz))
            assert_frame_equal(df.reset_index(), expected)