Пример #1
0
    def test_fillna_consistency(self):
        """Check fillna/where/assignment results on a tz-naive datetime Series.

        GH 16402: filling a tz-naive Series with a tz-aware Timestamp is
        expected to give an object-dtype Series.
        """
        naive = Timestamp('20130101')
        eastern = Timestamp('20130101', tz='US/Eastern')
        s = Series([naive, pd.NaT])

        # tz-aware fill value into tz-naive data
        expected = Series([naive, eastern], dtype='object')
        assert_series_equal(s.fillna(eastern), expected)

        # where() has to agree with fillna(); errors= is passed but ignored.
        # The same check is executed twice.
        for _ in range(2):
            replaced = s.where([True, False], eastern, errors='ignore')
            assert_series_equal(replaced, expected)

        # non-datetime fill value
        expected = Series([naive, 'foo'])
        assert_series_equal(s.fillna('foo'), expected)

        # plain item assignment has to give the same result as fillna('foo')
        s2 = s.copy()
        s2[1] = 'foo'
        assert_series_equal(s2, expected)
Пример #2
0
    def test_pad_nan(self):
        """In-place pad-fill fills the later gaps and leaves the leading NaN.

        The tail (everything after the first element) is compared as a Series;
        the first element is checked separately for being NaN.
        """
        labels = ["z", "a", "b", "c", "d"]
        x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], labels, dtype=float)

        x.fillna(method="pad", inplace=True)

        expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], labels, dtype=float)
        assert_series_equal(x[1:], expected[1:])
        # assertTrue's second positional argument is the failure message, not a
        # second condition, so each NaN check gets its own assertion. iloc makes
        # the positional lookup explicit on this string-labelled index.
        self.assertTrue(np.isnan(x.iloc[0]))
        self.assertTrue(np.isnan(expected.iloc[0]))
Пример #3
0
    def test_fillna_bug(self):
        """Forward and backward fill of a Series with NaN at both ends."""
        x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd'])

        # (fill method, values expected after filling)
        cases = (
            ('ffill', [nan, 1., 1., 3., 3.]),
            ('bfill', [1., 1., 3., 3., nan]),
        )
        for how, values in cases:
            filled = x.fillna(method=how)
            assert_series_equal(filled, Series(values, x.index))
Пример #4
0
    def test_fillna_bug(self):
        """ffill keeps the leading NaN, bfill keeps the trailing NaN."""
        x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"])

        forward = x.fillna(method="ffill")
        assert_series_equal(forward, Series([nan, 1.0, 1.0, 3.0, 3.0], x.index))

        backward = x.fillna(method="bfill")
        assert_series_equal(backward, Series([1.0, 1.0, 3.0, 3.0, nan], x.index))
Пример #5
0
    def test_pad_nan(self):
        """In-place pad-fill fills the later gaps and leaves the leading NaN.

        The tail (everything after the first element) is compared as a Series;
        the first element is checked separately for being NaN.
        """
        labels = ['z', 'a', 'b', 'c', 'd']
        x = Series([np.nan, 1., np.nan, 3., np.nan], labels, dtype=float)

        x.fillna(method='pad', inplace=True)

        expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], labels, dtype=float)
        assert_series_equal(x[1:], expected[1:])
        # assertTrue's second positional argument is the failure message, not a
        # second condition, so each NaN check gets its own assertion. iloc makes
        # the positional lookup explicit on this string-labelled index.
        self.assertTrue(np.isnan(x.iloc[0]))
        self.assertTrue(np.isnan(expected.iloc[0]))
Пример #6
0
def pd_03():
    """Print a small demo of filling missing values in pandas objects.

    Builds a random 6x3 frame, blanks the tail of columns 1 and 2, then prints
    the frame, its forward-filled forms (unlimited, and limited to 2 rows),
    and mean-filled versions of a Series and of the frame.
    """
    df = DataFrame(np.random.randn(6, 3))
    # .loc replaces the removed .ix indexer; on this default integer index the
    # label slice selects the same rows.
    df.loc[2:, 1] = np.nan
    df.loc[4:, 2] = np.nan
    print(df)
    # ffill() is the non-deprecated spelling of fillna(method='ffill')
    print(df.ffill())
    print(df.ffill(limit=2))
    data = Series([1., None, 3.5, None, 7])
    print(data.fillna(data.mean()))
    print(df.fillna(df.mean()))
Пример #7
0
    def test_fillna_raise(self):
        """fillna must reject list/tuple values and invalid ``limit`` arguments."""
        s = Series(np.random.randint(-100, 100, 50))
        for bad_value in ([1, 2], (1, 2)):
            with pytest.raises(TypeError):
                s.fillna(bad_value)

        # related GH 9217, make sure limit is an int and greater than 0
        s = Series([1, 2, 3, None])
        methods = ['backfill', 'bfill', 'pad', 'ffill', None]
        for limit in [-1, 0, 1., 2.]:
            for method in methods:
                with pytest.raises(ValueError):
                    s.fillna(1, limit=limit, method=method)
Пример #8
0
    def test_fillna(self):
        """Check default, backfill and scalar fills on a 5-point date-indexed Series.

        The no-argument ``fillna()`` calls are compared against values that
        carry the previous observation into the gap.
        """
        ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))

        # assertTrue replaces the assert_ alias, which Python 3.12 removed
        self.assertTrue(np.array_equal(ts, ts.fillna()))

        # np.nan replaces the np.NaN alias, which NumPy 2.0 removed
        ts[2] = np.nan

        self.assertTrue(np.array_equal(ts.fillna(), [0., 1., 1., 3., 4.]))
        self.assertTrue(np.array_equal(ts.fillna(method='backfill'),
                                       [0., 1., 3., 3., 4.]))

        self.assertTrue(np.array_equal(ts.fillna(value=5),
                                       [0., 1., 5., 3., 4.]))
Пример #9
0
    def test_fillna(self):
        """Fill a missing Period with another Period and with a plain string."""
        # GH 13737
        jan_2011 = pd.Period('2011-01', freq='M')
        s = Series([jan_2011, pd.Period('NaT', freq='M')])

        # Either fill value is expected to land in the second slot and the
        # result is expected to report object dtype.
        for fill in (pd.Period('2012-01', freq='M'), 'XXX'):
            res = s.fillna(fill)
            exp = Series([jan_2011, fill])
            tm.assert_series_equal(res, exp)
            assert res.dtype == 'object'
Пример #10
0
 def test_fillna_categorical(self, fill_value, expected_output):
     """Fill a categorical Series and compare with the expected categorical.

     GH 17033. ``fill_value`` is passed to fillna; ``expected_output`` holds
     the values the filled Series should contain.
     """
     categories = ['a', 'b']
     raw = ['a', np.nan, 'b', np.nan, np.nan]
     s = Series(Categorical(raw, categories=categories))
     exp = Series(Categorical(expected_output, categories=categories))
     filled = s.fillna(fill_value)
     tm.assert_series_equal(filled, exp)
Пример #11
0
    def test_fill_value_when_combine_const(self):
        """add(const, fill_value=0) must equal fillna(0) followed by add(const)."""
        # GH12723
        s = Series([0, 1, np.nan, 3, 4, 5])

        via_fill_value = s.add(2, fill_value=0)
        via_fillna = s.fillna(0).add(2)
        assert_series_equal(via_fill_value, via_fillna)
Пример #12
0
def sliding_freq(dictionary, data_raw, window, key_word):
    '''
    Count events and average their values in consecutive windows.

    For every entry of `dictionary`, the series found under `key_word` is cut
    into windows of size `window` (the original note gives the unit as ms)
    starting at 0, `window`, 2 * `window`, ... and, per window, the number of
    non-missing values and their mean are recorded. Windows without any value
    get a mean of 0 so that they can be plotted.

    Parameters
    ----------
    dictionary : mapping
        Maps a test-case name to an object indexable by `key_word`
        (presumably a DataFrame or dict of Series -- confirm against callers).
    data_raw : object with an `.index`
        Only its last index value is used, to decide how many windows fit.
    window : number
        Window size, in the same unit as the index.
    key_word : hashable
        Key selecting the series to analyse inside each dictionary value.

    Returns
    -------
    (sliding_mean, sliding_count) : tuple of DataFrame
        One column per dictionary key (columns sorted), one row per window
        start. Note the order: means first, counts second.
    '''
    import math

    # number of whole windows that fit before the last index value
    num = math.trunc(data_raw.index[-1] / window)
    window_starts = np.arange(num) * window

    sliding_count = DataFrame(index=window_starts)
    sliding_mean = DataFrame(index=window_starts)

    # .items() works on Python 2 and 3; .iteritems() exists only on Python 2
    for key, value in dictionary.items():
        events = value[key_word]

        # An explicit float dtype keeps the result numeric; a data-less Series
        # no longer defaults to float64 in current pandas.
        temp_count = Series(index=window_starts, dtype=float)
        temp_mean = Series(index=window_starts, dtype=float)

        for start in window_starts:
            in_window = events[start:(start + window)]
            temp_count[start] = in_window.count()
            temp_mean[start] = in_window.mean()

        sliding_count[key] = temp_count
        # mean() is NaN for windows without events; use 0 for graphing
        sliding_mean[key] = temp_mean.fillna(0)

    # put the columns in increasing order
    sliding_count = sliding_count.sort_index(axis=1)
    sliding_mean = sliding_mean.sort_index(axis=1)
    return sliding_mean, sliding_count
Пример #13
0
    def test_endswith(self):
        """Check str.endswith on plain, mixed-type and unicode string data."""
        values = Series(["om", NA, "foo_nom", "nom", "bar_foo", NA, "foo"])

        result = values.str.endswith("foo")
        exp = Series([False, NA, False, False, True, NA, True])
        tm.assert_series_equal(result, exp)

        # mixed: the non-string entries are expected to come back as NA
        mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0]
        rs = strings.str_endswith(mixed, "f")
        xp = [False, NA, False, NA, NA, False, NA, NA, NA]
        tm.assert_almost_equal(rs, xp)

        rs = Series(mixed).str.endswith("f")
        # assertTrue replaces the assert_ alias, which Python 3.12 removed
        self.assertTrue(isinstance(rs, Series))
        tm.assert_almost_equal(rs, xp)

        # unicode
        values = Series([u"om", NA, u"foo_nom", u"nom", u"bar_foo", NA, u"foo"])

        result = values.str.endswith("foo")
        exp = Series([False, NA, False, False, True, NA, True])
        tm.assert_series_equal(result, exp)

        # na=False turns the missing entries into False, giving a bool result
        result = values.str.endswith("foo", na=False)
        tm.assert_series_equal(result, exp.fillna(False).astype(bool))
Пример #14
0
    def test_endswith(self):
        """Check str.endswith on plain, mixed-type and unicode string data."""
        foo_flags = [False, NA, False, False, True, NA, True]

        values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
        tm.assert_series_equal(values.str.endswith('foo'), Series(foo_flags))

        # mixed: the non-string entries are expected to come back as NA
        mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
        xp = [False, NA, False, NA, NA, False, NA, NA, NA]
        tm.assert_almost_equal(strings.str_endswith(mixed, 'f'), xp)

        rs = Series(mixed).str.endswith('f')
        tm.assert_isinstance(rs, Series)
        tm.assert_almost_equal(rs, xp)

        # unicode
        values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA,
                         u('foo')])
        exp = Series(foo_flags)
        tm.assert_series_equal(values.str.endswith('foo'), exp)

        # na=False turns the missing entries into False, giving a bool result
        without_na = values.str.endswith('foo', na=False)
        tm.assert_series_equal(without_na, exp.fillna(False).astype(bool))
Пример #15
0
    def test_fillna_inplace(self):
        """In-place fillna must give the same Series as the copying form."""
        x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"])

        # reference result from the non-mutating call
        expected = x.fillna(value=0)

        y = x.copy()
        y.fillna(value=0, inplace=True)
        assert_series_equal(y, expected)
Пример #16
0
def count_enf_born(info_child, index):
    """Return, for every label in `index`, the summed `enf_born` of that parent.

    `enf_born` is `nb_enf` where `age_enf >= 0` and 0 otherwise; it is written
    into `info_child` as a new column (the caller's frame is modified). The
    values are summed per `id_parent` and aligned on `index`; labels without a
    matching parent get 0.
    """
    already_born = info_child['age_enf'] >= 0
    info_child['enf_born'] = already_born * info_child['nb_enf']

    per_parent = (
        info_child.groupby(['id_parent'])['enf_born']
        .sum()
        .reset_index()
    )
    per_parent.columns = ['id_parent', 'nb_born']
    per_parent = per_parent.set_index('id_parent', drop=False)

    # start from zeros on the requested index, then add the aligned totals
    nb_born = Series(zeros(len(index)), index=index)
    nb_born += per_parent['nb_born']
    return nb_born.fillna(0)
Пример #17
0
    def test_fillna_inplace(self):
        """fillna(inplace=True) on a copy must match fillna() on the source."""
        source = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd'])
        mutated = source.copy()

        mutated.fillna(value=0, inplace=True)

        assert_series_equal(mutated, source.fillna(value=0))
Пример #18
0
    def test_fillna_nat(self):
        """Pad-fill and value-fill of a trailing NaT, for Series and DataFrame."""
        series = Series([0, 1, 2, NaT], dtype='M8[ns]')
        last_valid = series.values[2]

        padded = series.fillna(method='pad')
        valued = series.fillna(value=last_valid)

        # expected: the last valid entry copied into the NaT slot
        expected = series.copy()
        expected.values[3] = expected.values[2]

        for filled in (padded, valued):
            assert_series_equal(filled, expected)

        df = DataFrame({'A': series})
        padded = df.fillna(method='pad')
        valued = df.fillna(value=last_valid)
        expected_frame = DataFrame({'A': expected})
        for filled in (padded, valued):
            assert_frame_equal(filled, expected_frame)
Пример #19
0
    def test_fillna_nat(self):
        """Pad-fill and value-fill of a trailing NaT, for Series and DataFrame."""
        series = Series([0, 1, 2, NaT], dtype="M8[us]")

        by_method = series.fillna(method="pad")
        by_value = series.fillna(value=series[2])

        # expected: the last valid entry copied into the NaT slot
        expected = series.copy()
        expected[3] = expected[2]

        assert_series_equal(by_method, expected)
        assert_series_equal(by_value, expected)

        # same checks with the Series wrapped in a single-column frame
        df = DataFrame({"A": series})
        frame_by_method = df.fillna(method="pad")
        frame_by_value = df.fillna(value=series[2])
        expected_frame = DataFrame({"A": expected})
        assert_frame_equal(frame_by_method, expected_frame)
        assert_frame_equal(frame_by_value, expected_frame)
Пример #20
0
    def test_datetime64_fillna(self):
        """Value, NaT, forward and backward fills on datetime64 data."""
        jan1 = Timestamp("20130101")
        jan3_morning = Timestamp("20130103 9:01:01")

        s = Series([jan1, jan1, Timestamp("20130102"), jan3_morning])
        s[2] = np.nan

        # reg fillna
        jan4 = Timestamp("20130104")
        assert_series_equal(s.fillna(jan4),
                            Series([jan1, jan1, jan4, jan3_morning]))

        from pandas import tslib

        # filling with NaT is expected to give back an equal series
        assert_series_equal(s.fillna(tslib.NaT), s)

        # ffill
        assert_series_equal(s.ffill(),
                            Series([jan1, jan1, jan1, jan3_morning]))

        # bfill
        assert_series_equal(s.bfill(),
                            Series([jan1, jan1, jan3_morning, jan3_morning]))

        # GH 6587
        # make sure that we are treating as integer when filling
        # this also tests inference of a datetime-like with NaT's
        stamp = "2013-08-05 15:30:00.000001"
        s = Series([pd.NaT, pd.NaT, stamp])
        expected = Series([stamp, stamp, stamp], dtype="M8[ns]")
        assert_series_equal(s.fillna(method="backfill"), expected)
Пример #21
0
    def test_bool_operators_with_nas(self, bool_op):
        """Boolean &, |, ^ on object-dtype comparisons must treat NA rows as False.

        ``bool_op`` is the binary operator under test.
        """
        ser = Series(bdate_range('1/1/2000', periods=10), dtype=object)
        ser[::2] = np.nan

        na_positions = ser.isna()
        result = bool_op(ser < ser[9], ser > ser[3])

        # Reference: same operation on data with the NAs filled in, with the
        # rows that were NA forced to False afterwards.
        filled = ser.fillna(ser[0])
        expected = bool_op(filled < filled[9], filled > filled[3])
        expected[na_positions] = False
        assert_series_equal(result, expected)
Пример #22
0
    def test_datetime64_fillna(self):
        """Value, NaT, forward and backward fills on datetime64 data."""
        day1 = Timestamp('20130101')
        day3 = Timestamp('20130103 9:01:01')

        s = Series([day1, day1, Timestamp('20130102'), day3])
        s[2] = np.nan

        # reg fillna
        day4 = Timestamp('20130104')
        result = s.fillna(day4)
        assert_series_equal(result, Series([day1, day1, day4, day3]))

        # filling with NaT is expected to give back an equal series
        result = s.fillna(NaT)
        assert_series_equal(result, s)

        # ffill
        result = s.ffill()
        assert_series_equal(result, Series([day1, day1, day1, day3]))

        # bfill
        result = s.bfill()
        assert_series_equal(result, Series([day1, day1, day3, day3]))

        # GH 6587
        # make sure that we are treating as integer when filling
        # this also tests inference of a datetime-like with NaT's
        text = '2013-08-05 15:30:00.000001'
        s = Series([pd.NaT, pd.NaT, text])
        expected = Series([text] * 3, dtype='M8[ns]')
        result = s.fillna(method='backfill')
        assert_series_equal(result, expected)
Пример #23
0
def main():
    """
    Print a walkthrough of handling missing (N/A) values in pandas.

    Three parts, separated by blank-ish lines: detecting missing values,
    dropping them (dropna), and filling them (fillna / ffill).
    """

    string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
    print(string_data)
    print(string_data.isnull())
    string_data[0] = None
    print(string_data.isnull())
    print(None is np.nan, None == np.nan)  # not same

    # Exclude N/A
    print('', '')
    NA = np.nan
    data = Series([1, NA, 3.5, NA, 7])
    print(data.dropna())
    print(data[data.notnull()])

    data = DataFrame([
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.]
    ])
    cleaned = data.dropna()  # keeps only rows without any NA
    print(data)
    print(cleaned)
    print(data.dropna(how='all'))
    data[4] = None
    print(data.dropna(axis=1, how='all'))
    print(data.dropna(thresh=2))  # keeps rows with at least 2 non-NA values

    # Fill NA
    print('', '')
    print(data.fillna(0))
    print(data.fillna({1: 0.5, 2: -1}))
    _ = data.fillna(0, inplace=True)
    print(data)
    print('', '')
    df = DataFrame(np.arange(18).reshape((6, 3)))
    # NaN cannot live in an integer column; cast the two columns that receive
    # it up front instead of relying on an implicit upcast during assignment.
    df = df.astype({1: float, 2: float})
    # .loc replaces the removed .ix indexer; on this default integer index the
    # label slice selects the same rows.
    df.loc[2:, 1] = NA
    df.loc[4:, 2] = NA
    print(df)
    # ffill() is the non-deprecated spelling of fillna(method='ffill')
    print(df.ffill())
    print(df.ffill(limit=2))
    data = Series([1., NA, 3.5, NA, 7])
    print(data.fillna(data.mean()))
Пример #24
0
    def test_comparison_operators_with_nas(self):
        """Comparison and boolean operators on an object Series containing NaN."""
        s = Series(bdate_range('1/1/2000', periods=10), dtype=object)
        s[::2] = np.nan

        # test that comparisons work
        for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']:
            val = s[5]
            f = getattr(operator, op)

            result = f(s, val)

            # Reference: compare only the non-missing values, then put the
            # missing rows back; they count as True for 'ne', False otherwise.
            expected = f(s.dropna(), val).reindex(s.index)
            expected = expected.fillna(op == 'ne').astype(bool)

            assert_series_equal(result, expected)

            # The reflected form, f(val, s), is not checked here.

        # boolean &, |, ^ should work with object arrays and propagate NAs
        mask = s.isnull()
        filled = s.fillna(s[0])
        for bool_op in ['and_', 'or_', 'xor']:
            f = getattr(operator, bool_op)

            result = f(s < s[9], s > s[3])

            expected = f(filled < filled[9], filled > filled[3])
            expected[mask] = False
            assert_series_equal(result, expected)
Пример #25
0
def test_timedelta_assignment():
    """Assigning timedeltas through .loc, reindexing and filling."""
    # GH 8209
    one_day = Timedelta("1 days")

    s = Series([], dtype=object)
    s.loc["B"] = timedelta(1)
    tm.assert_series_equal(s, Series(one_day, index=["B"]))

    # prepend a new, empty label
    s = s.reindex(s.index.insert(0, "A"))
    tm.assert_series_equal(s, Series([np.nan, one_day], index=["A", "B"]))

    expected = Series(one_day, index=["A", "B"])
    tm.assert_series_equal(s.fillna(timedelta(1)), expected)

    s.loc["A"] = timedelta(1)
    tm.assert_series_equal(s, expected)

    # GH 14155
    ten_min = np.timedelta64(10, "m")
    twenty_min = np.timedelta64(20, "m")
    s = Series(10 * [ten_min])
    s.loc[[1, 2, 3]] = twenty_min
    expected = pd.Series(10 * [ten_min])
    expected.loc[[1, 2, 3]] = pd.Timedelta(twenty_min)
    tm.assert_series_equal(s, expected)
Пример #26
0
def test_timedelta_assignment():
    """Assigning timedeltas through .loc, reindexing and filling."""
    # GH 8209
    day = Timedelta('1 days')

    s = Series([])
    s.loc['B'] = timedelta(1)
    single = Series(day, index=['B'])
    tm.assert_series_equal(s, single)

    # prepend a new, empty label
    s = s.reindex(s.index.insert(0, 'A'))
    with_gap = Series([np.nan, day], index=['A', 'B'])
    tm.assert_series_equal(s, with_gap)

    result = s.fillna(timedelta(1))
    expected = Series(day, index=['A', 'B'])
    tm.assert_series_equal(result, expected)

    s.loc['A'] = timedelta(1)
    tm.assert_series_equal(s, expected)

    # GH 14155
    short = np.timedelta64(10, 'm')
    longer = np.timedelta64(20, 'm')
    rows = [1, 2, 3]
    s = Series(10 * [short])
    s.loc[rows] = longer
    expected = pd.Series(10 * [short])
    expected.loc[rows] = pd.Timedelta(longer)
    tm.assert_series_equal(s, expected)
Пример #27
0
def test_timedelta_assignment():
    """Assigning timedeltas through .loc, reindexing and filling."""
    # GH 8209
    s = Series([])
    s.loc['B'] = timedelta(1)
    tm.assert_series_equal(s, Series(Timedelta('1 days'), index=['B']))

    # prepend a new, empty label
    labels = s.index.insert(0, 'A')
    s = s.reindex(labels)
    tm.assert_series_equal(
        s, Series([np.nan, Timedelta('1 days')], index=['A', 'B']))

    expected = Series(Timedelta('1 days'), index=['A', 'B'])
    filled = s.fillna(timedelta(1))
    tm.assert_series_equal(filled, expected)

    s.loc['A'] = timedelta(1)
    tm.assert_series_equal(s, expected)

    # GH 14155
    base = 10 * [np.timedelta64(10, 'm')]
    s = Series(base)
    s.loc[[1, 2, 3]] = np.timedelta64(20, 'm')
    expected = pd.Series(10 * [np.timedelta64(10, 'm')])
    expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, 'm'))
    tm.assert_series_equal(s, expected)
Пример #28
0
 def _normalize_names(self, names: pd.Series) -> pd.Series:
     """Take names and run a normalization routine"""
     # Make a transalation table of unwanted characers
     unwanted_characters = (
         string.digits +
         string.punctuation +
         string.whitespace
     )
     # Remove unwanted characters efficiently
     translation_table =  str.maketrans('', '', unwanted_characters)
     # Run our string operations (remember NAN is a valid name)
     output = (
         names.fillna('')
              .astype(str)
              .str.translate(translation_table)
              .str.upper()
              .str.replace(r'\s?J\.*?R\.*\s*?$', '', regex=True)
              .str.replace(r'\s?S\.*?R\.*\s*?$', '', regex=True)
              .str.replace(r'\s?III\s*?$',      '', regex=True)
              .str.replace(r'\s?IV\s*?$',       '', regex=True)
     )
     output.name = 'name'
     return output
Пример #29
0
    def test_fillna_float_casting(self, dtype, fill_type, scalar):
        """fillna, masked setitem, mask() and where() must all keep `dtype`.

        GH-43424. The fill values are built with `fill_type`; when `scalar`
        is truthy a scalar of that type is used instead of a Series.
        """
        def fresh():
            # a new, unfilled input for every variant
            return Series([np.nan, 1.2], dtype=dtype)

        fill_values = Series([2, 2], dtype=fill_type)
        if scalar:
            fill_values = fill_values.dtype.type(2)
        expected = Series([2.0, 1.2], dtype=dtype)

        # fillna
        tm.assert_series_equal(fresh().fillna(fill_values), expected)

        # boolean-mask assignment
        ser = fresh()
        mask = ser.isna().to_numpy()
        ser[mask] = fill_values
        tm.assert_series_equal(ser, expected)

        # in-place mask()
        ser = fresh()
        ser.mask(mask, fill_values, inplace=True)
        tm.assert_series_equal(ser, expected)

        # where() with the inverted mask
        tm.assert_series_equal(fresh().where(~mask, fill_values), expected)
Пример #30
0
def binned_sampling(values: pd.Series, feature_list: List[str], ctrl_size: int,
                    n_bins: int, rand_seed: int) -> List[str]:
    """
    Sample control features from the value bins occupied by `feature_list`.

    All features are ranked by value (missing values count as 0) and cut into
    bins of roughly ``len(values) / (n_bins - 1)`` features. For every bin
    that contains at least one feature of `feature_list`, up to `ctrl_size`
    features are drawn from that bin. The features of `feature_list`
    themselves are removed from the result.

    This reproduces the reference-set sampling of Seurat [Satija15]_, which
    was implemented for Scanpy by Davide Cittaro; the function is adapted
    from Scanpy's `score_genes`.

    Args:
        values: The values for the features, indexed by feature name.
        feature_list: The features whose bins are sampled; each must be a
            label of `values`.
        ctrl_size: Number of reference features to be sampled from each bin.
            A bin with fewer members contributes all of them.
        n_bins: Number of bins for sampling.
        rand_seed: The seed to use for the random number generation.

    Returns:
        A list of sampled features (in no particular order).

    Raises:
        ZeroDivisionError: If `n_bins` is 1.
    """
    n_items = int(np.round(len(values) / (n_bins - 1)))
    feature_set = set(feature_list)
    # Made following more linter friendly
    # obs_cut = obs_avg.rank(method='min') // n_items
    obs_cut: pd.Series = values.fillna(0).rank(
        method='min').divide(n_items).astype(int)

    control_genes = set()
    # pandas does not accept a set as an indexer, so look the features up
    # through a list.
    for cut in np.unique(obs_cut[list(feature_set)]):
        bin_members = obs_cut[obs_cut == cut]
        # sample() raises when asked for more rows than exist; cap the draw
        # so that a small bin simply contributes everything it has.
        n_draw = min(ctrl_size, len(bin_members))
        r_genes = bin_members.sample(n=n_draw, random_state=rand_seed).index
        control_genes.update(r_genes)
    return list(control_genes - feature_set)
Пример #31
0
    def _prepare_data_dtypes(self, series: pd.Series) -> pd.Series:
        """
        Подготовка данных для передачи данных на вход encoder'a:
            - замена пропусков на fill_value;
            - преобразованеи столбца значений в object-столбец.

        Parameters:
        -----------
        series: pandas.Series
            Вектор наблюдений.

        Returns:
        --------
        series_prepared: pandas.Series
            Преобразованный вектор наблюдений.
        """
        try:
            if series.dtype == 'category':
                series = series.cat.add_categories(self.fill_value)
        except TypeError:
            pass
        series_prepared = series.fillna(self.fill_value)
        series_prepared = series_prepared.astype("str")
        return series_prepared
Пример #32
0
    def test_fillna_raise(self):
        """fillna must reject list/tuple values and invalid ``limit`` arguments."""
        s = Series(np.random.randint(-100, 100, 50))

        # (rejected value, message expected for it)
        rejected = (
            ([1, 2],
             '"value" parameter must be a scalar or dict, but you passed a "list"'),
            ((1, 2),
             '"value" parameter must be a scalar or dict, but you passed a "tuple"'),
        )
        for bad_value, msg in rejected:
            with pytest.raises(TypeError, match=msg):
                s.fillna(bad_value)

        # related GH 9217, make sure limit is an int and greater than 0
        s = Series([1, 2, 3, None])
        msg = (r"Cannot specify both 'value' and 'method'\.|"
               r"Limit must be greater than 0|"
               "Limit must be an integer")
        methods = ["backfill", "bfill", "pad", "ffill", None]
        for limit in [-1, 0, 1.0, 2.0]:
            for method in methods:
                with pytest.raises(ValueError, match=msg):
                    s.fillna(1, limit=limit, method=method)
Пример #33
0
    def test_fillna(self):
        """Method fills, a scalar fill and the no-argument error on a date-indexed Series."""
        ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5))

        # nothing is missing yet, so a forward fill must change nothing
        tm.assert_series_equal(ts, ts.fillna(method="ffill"))

        # np.nan replaces the np.NaN alias, which NumPy 2.0 removed
        ts[2] = np.nan

        exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index)
        tm.assert_series_equal(ts.fillna(method="ffill"), exp)

        exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index)
        tm.assert_series_equal(ts.fillna(method="backfill"), exp)

        exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index)
        tm.assert_series_equal(ts.fillna(value=5), exp)

        # calling fillna without a value or a method must be rejected
        msg = "Must specify a fill 'value' or 'method'"
        with pytest.raises(ValueError, match=msg):
            ts.fillna()
Пример #34
0
    def test_fillna(self):
        """Method, scalar, dict/Series, limit and string fills of a float Series."""
        ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))

        # nothing is missing yet, so a forward fill must change nothing
        self.assert_numpy_array_equal(ts, ts.fillna(method='ffill'))

        # np.nan replaces the np.NaN alias, which NumPy 2.0 removed
        ts[2] = np.nan

        self.assert_numpy_array_equal(ts.fillna(method='ffill'),
                                      [0., 1., 1., 3., 4.])
        self.assert_numpy_array_equal(ts.fillna(method='backfill'),
                                      [0., 1., 3., 3., 4.])

        self.assert_numpy_array_equal(ts.fillna(value=5), [0., 1., 5., 3., 4.])

        # neither value nor method, and both at once, must be rejected
        self.assertRaises(ValueError, ts.fillna)
        self.assertRaises(ValueError, self.ts.fillna, value=0, method='ffill')

        # GH 5703
        # dict / Series fill values are matched on index labels
        s1 = Series([np.nan])
        s2 = Series([1])
        result = s1.fillna(s2)
        expected = Series([1.])
        assert_series_equal(result, expected)
        result = s1.fillna({})
        assert_series_equal(result, s1)
        result = s1.fillna(Series(()))
        assert_series_equal(result, s1)
        result = s2.fillna(s1)
        assert_series_equal(result, s2)
        result = s1.fillna({0: 1})
        assert_series_equal(result, expected)
        result = s1.fillna({1: 1})
        assert_series_equal(result, Series([np.nan]))
        result = s1.fillna({0: 1, 1: 1})
        assert_series_equal(result, expected)
        result = s1.fillna(Series({0: 1, 1: 1}))
        assert_series_equal(result, expected)
        result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5]))
        assert_series_equal(result, s1)

        s1 = Series([0, 1, 2], list('abc'))
        s2 = Series([0, np.nan, 2], list('bac'))
        result = s2.fillna(s1)
        expected = Series([0, 0, 2.], list('bac'))
        assert_series_equal(result, expected)

        # limit
        s = Series(np.nan, index=[0, 1, 2])
        result = s.fillna(999, limit=1)
        expected = Series([999, np.nan, np.nan], index=[0, 1, 2])
        assert_series_equal(result, expected)

        result = s.fillna(999, limit=2)
        expected = Series([999, 999, np.nan], index=[0, 1, 2])
        assert_series_equal(result, expected)

        # GH 9043
        # make sure a string representation of int/float values can be filled
        # correctly without raising errors or being converted
        vals = ['0', '1.5', '-0.3']
        for val in vals:
            s = Series([0, 1, np.nan, np.nan, 4], dtype='float64')
            result = s.fillna(val)
            expected = Series([0, 1, val, val, 4], dtype='object')
            assert_series_equal(result, expected)
Пример #35
0
 def test_bfill(self):
     """bfill() must give the same result as fillna(method='bfill')."""
     ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))
     # np.nan replaces the np.NaN alias, which NumPy 2.0 removed
     ts[2] = np.nan
     assert_series_equal(ts.bfill(), ts.fillna(method='bfill'))
Пример #36
0
 def test_fillna_int(self):
     """Forward-filling an integer Series in place and by copy must agree."""
     s = Series(np.random.randint(-100, 100, 50))
     s.fillna(method='ffill', inplace=True)
     refilled = s.fillna(method='ffill', inplace=False)
     assert_series_equal(refilled, s)
Пример #37
0
def ebsw(close, length=None, bars=None, offset=None, **kwargs):
    """Indicator: Even Better SineWave (EBSW)

    High-pass filters ``close``, smooths the result with the recursive
    "Super Smoother" filter, then divides the 3-bar average of the filtered
    values by the square root of their 3-bar average power.

    Args:
        close: Price series; passed through ``verify_series(close, length)``.
        length: High-pass period. Cast to ``int``; a falsy value or anything
            not greater than 38 falls back to 40.
        bars: Smoothing period. Cast to ``int``; a falsy or non-positive
            value falls back to 10.
        offset: Passed through ``get_offset``; a non-zero result shifts the
            output by that many periods.
        **kwargs:
            fillna: value handed to ``Series.fillna``.
            fill_method: method name handed to ``Series.fillna(method=...)``.

    Returns:
        A Series named ``EBSW_{length}_{bars}`` with category ``"cycles"`` on
        the index of ``close``, or ``None`` when ``verify_series`` returns
        ``None``. The first ``length - 1`` entries are NaN and the entry
        after them is 0.
    """
    # Validate arguments
    length = int(length) if length and length > 38 else 40
    bars = int(bars) if bars and bars > 0 else 10
    close = verify_series(close, length)
    offset = get_offset(offset)

    if close is None: return

    # Filter coefficients depend only on `length` and `bars`, so they are
    # computed once instead of on every bar.
    # NOTE(review): 360 / length and 180 / bars are handed to npSin / npCos
    # unchanged; if those are radian-based the arguments look like degrees --
    # confirm against the reference formula before changing anything.
    alpha1 = (1 - npSin(360 / length)) / npCos(360 / length)  # HighPass
    a1 = npExp(-npSqrt(2) * npPi / bars)
    b1 = 2 * a1 * npCos(npSqrt(2) * 180 / bars)
    c2 = b1
    c3 = -1 * a1 * a1
    c1 = 1 - c2 - c3

    lastClose = lastHP = 0
    FilterHist = [0, 0]  # Filter history: [two bars ago, previous bar]

    # Calculate Result
    m = close.size
    result = [npNaN for _ in range(0, length - 1)] + [0]
    for i in range(length, m):
        # `i` is a row position, not an index label, so read it positionally;
        # plain close[i] would be a label lookup on an integer-labelled index.
        price = close.iloc[i]

        # HighPass filter cyclic components whose periods are shorter than Duration input
        HP = 0.5 * (1 + alpha1) * (price - lastClose) + alpha1 * lastHP

        # Smooth with a Super Smoother Filter from equation 3-3
        Filt = c1 * (HP + lastHP) / 2 + c2 * FilterHist[1] + c3 * FilterHist[0]
        # Filt = float("{:.8f}".format(float(Filt))) # to fix for small scientific notations, the big ones fail

        # 3 Bar average of Wave amplitude and power
        Wave = (Filt + FilterHist[1] + FilterHist[0]) / 3
        Pwr = (Filt * Filt + FilterHist[1] * FilterHist[1] +
               FilterHist[0] * FilterHist[0]) / 3

        # Normalize the Average Wave to Square Root of the Average Power
        # NOTE(review): three zero filter values make this 0 / 0 -- confirm
        # that a NaN entry is the intended output in that case.
        Wave = Wave / npSqrt(Pwr)

        # update storage, result: drop the oldest filter value, keep the newest
        FilterHist = [FilterHist[1], Filt]
        lastHP = HP
        lastClose = price
        result.append(Wave)

    ebsw = Series(result, index=close.index)

    # Offset
    if offset != 0:
        ebsw = ebsw.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        ebsw.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        ebsw.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    ebsw.name = f"EBSW_{length}_{bars}"
    ebsw.category = "cycles"

    return ebsw
Пример #38
0
# Replace every missing value with one constant; the result is a new object.
df.fillna(0)
# A dict gives a separate fill value per key (assumes df is a DataFrame whose
# columns are labelled 1, 2, 3 -- TODO confirm against where df is built).
df.fillna({1: 0.5, 3: -1})
df.fillna({1: 0.5, 2: -1})

# Same constant fill, but applied to df itself.
df.fillna(0, inplace=True)
df

# 6x3 frame of random numbers with NA written into the tail of columns 1 and 2.
df = DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
# Fill using the 'ffill' method, first without and then with limit=2.
df.fillna(method='ffill')
df.fillna(method='ffill', limit=2)

# Fill the gaps of a Series with that Series' own mean.
data = Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

### Hierarchical Indexing -- adding higher dimensionality -- no examples

### read_csv et al
df = pd.read_csv('ex1.csv')
df
pd.read_table('ex1.csv', sep=',')

# ex2.csv is read without a header row: first with default column names,
# then with explicit ones.
pd.read_csv('ex2.csv', header=None)
pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ex2.csv', names=names, index_col='message')

# Raw string: '\s' is not a valid escape sequence in a normal string literal.
result = pd.read_table('ex3.txt', sep=r'\s+')
Пример #39
0
 def test_fillna_int(self):
     """In-place ffill returns None; a following copying ffill equals the Series."""
     ints = Series(np.random.randint(-100, 100, 50))

     inplace_result = ints.fillna(method="ffill", inplace=True)
     assert inplace_result is None

     refilled = ints.fillna(method="ffill", inplace=False)
     tm.assert_series_equal(refilled, ints)
Пример #40
0
 def test_bfill(self):
     """``Series.bfill()`` must give the same result as ``fillna(method="bfill")``."""
     # Five floats on the index from tm.makeDateIndex(5), with a gap at row 2.
     series = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5))
     series[2] = np.NaN

     via_shortcut = series.bfill()
     via_fillna = series.fillna(method="bfill")
     tm.assert_series_equal(via_shortcut, via_fillna)
Пример #41
0
    def test_fillna(self, datetime_series):
        """Cover ``Series.fillna`` with methods, scalars, dicts, Series and ``limit``."""
        series = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5))

        # Nothing is missing yet, so a forward fill must change nothing.
        tm.assert_series_equal(series, series.fillna(method="ffill"))

        series[2] = np.NaN

        # One gap at row 2: fill it forwards, backwards, and with a scalar.
        for fill_kwargs, values in (
            ({"method": "ffill"}, [0.0, 1.0, 1.0, 3.0, 4.0]),
            ({"method": "backfill"}, [0.0, 1.0, 3.0, 3.0, 4.0]),
            ({"value": 5}, [0.0, 1.0, 5.0, 3.0, 4.0]),
        ):
            wanted = Series(values, index=series.index)
            tm.assert_series_equal(series.fillna(**fill_kwargs), wanted)

        # Argument validation: neither / both of value and method.
        neither = "Must specify a fill 'value' or 'method'"
        with pytest.raises(ValueError, match=neither):
            series.fillna()

        both = "Cannot specify both 'value' and 'method'"
        with pytest.raises(ValueError, match=both):
            datetime_series.fillna(value=0, method="ffill")

        # GH#5703
        nan_ser = Series([np.nan])
        one_ser = Series([1])
        filled_one = Series([1.0])

        for fill_value, wanted in (
            (one_ser, filled_one),
            ({}, nan_ser),
            (Series((), dtype=object), nan_ser),
        ):
            tm.assert_series_equal(nan_ser.fillna(fill_value), wanted)

        # A Series without nulls comes back unchanged.
        tm.assert_series_equal(one_ser.fillna(nan_ser), one_ser)

        for fill_value, wanted in (
            ({0: 1}, filled_one),
            ({1: 1}, Series([np.nan])),
            ({0: 1, 1: 1}, filled_one),
            (Series({0: 1, 1: 1}), filled_one),
            (Series({0: 1, 1: 1}, index=[4, 5]), nan_ser),
        ):
            tm.assert_series_equal(nan_ser.fillna(fill_value), wanted)

        # Fill values come from a Series whose labels are in a different order.
        source = Series([0, 1, 2], list("abc"))
        gappy = Series([0, np.nan, 2], list("bac"))
        tm.assert_series_equal(
            gappy.fillna(source), Series([0, 0, 2.0], list("bac"))
        )

        # limit
        all_nan = Series(np.nan, index=[0, 1, 2])
        for limit, values in (
            (1, [999, np.nan, np.nan]),
            (2, [999, 999, np.nan]),
        ):
            filled = all_nan.fillna(999, limit=limit)
            tm.assert_series_equal(filled, Series(values, index=[0, 1, 2]))

        # GH#9043
        # make sure a string representation of int/float values can be filled
        # correctly without raising errors or being converted
        for text in ("0", "1.5", "-0.3"):
            floats = Series([0, 1, np.nan, np.nan, 4], dtype="float64")
            filled = floats.fillna(text)
            wanted = Series([0, 1, text, text, 4], dtype="object")
            tm.assert_series_equal(filled, wanted)
Пример #42
0
    def test_datetime64_tz_fillna(self, tz):
        """Check ``Series.fillna`` on naive and tz-aware datetime Series.

        Each case fills the two ``NaT`` slots of ``ser`` with a scalar or a
        ``{index label: value}`` dict, compares the outcome with an
        explicitly built Series, and re-checks that ``ser`` still has its
        nulls.
        """
        d1, d2, d3, d4 = (
            "2011-01-01 10:00",
            "2011-01-02 10:00",
            "2011-01-03 10:00",
            "2011-01-04 10:00",
        )

        def naive(stamp):
            return Timestamp(stamp)

        def aware(stamp):
            return Timestamp(stamp, tz=tz)

        null_loc = Series([False, True, False, True])

        def check(filled, wanted):
            tm.assert_series_equal(wanted, filled)
            # check ser is not changed by the fill
            tm.assert_series_equal(isna(ser), null_loc)

        # DatetimeLikeBlock
        ser = Series([naive(d1), NaT, naive(d3), NaT])

        check(
            ser.fillna(naive(d2)),
            Series([naive(d1), naive(d2), naive(d3), naive(d2)]),
        )
        check(
            ser.fillna(aware(d2)),
            Series([naive(d1), aware(d2), naive(d3), aware(d2)]),
        )
        check(
            ser.fillna("AAA"),
            Series([naive(d1), "AAA", naive(d3), "AAA"], dtype=object),
        )
        check(
            ser.fillna({1: aware(d2), 3: naive(d4)}),
            Series([naive(d1), aware(d2), naive(d3), naive(d4)]),
        )
        check(
            ser.fillna({1: naive(d2), 3: naive(d4)}),
            Series([naive(d1), naive(d2), naive(d3), naive(d4)]),
        )

        # DatetimeTZBlock
        ser = Series(DatetimeIndex([d1, NaT, d3, NaT], tz=tz))
        assert ser.dtype == f"datetime64[ns, {tz}]"
        tm.assert_series_equal(isna(ser), null_loc)

        check(
            ser.fillna(naive(d2)),
            Series([aware(d1), naive(d2), aware(d3), naive(d2)]),
        )
        check(
            ser.fillna(aware(d2)),
            Series(DatetimeIndex([d1, d2, d3, d2], tz=tz)),
        )
        check(
            ser.fillna(aware(d2).to_pydatetime()),
            Series(DatetimeIndex([d1, d2, d3, d2], tz=tz)),
        )
        check(
            ser.fillna("AAA"),
            Series([aware(d1), "AAA", aware(d3), "AAA"], dtype=object),
        )
        check(
            ser.fillna({1: aware(d2), 3: naive(d4)}),
            Series([aware(d1), aware(d2), aware(d3), naive(d4)]),
        )
        check(
            ser.fillna({1: aware(d2), 3: aware(d4)}),
            Series([aware(d1), aware(d2), aware(d3), aware(d4)]),
        )

        # filling with a naive/other zone, coerce to object
        check(
            ser.fillna(Timestamp("20130101")),
            Series([
                aware(d1),
                Timestamp("2013-01-01"),
                aware(d3),
                Timestamp("2013-01-01"),
            ]),
        )
        check(
            ser.fillna(Timestamp("20130101", tz="US/Pacific")),
            Series([
                aware(d1),
                Timestamp("2013-01-01", tz="US/Pacific"),
                aware(d3),
                Timestamp("2013-01-01", tz="US/Pacific"),
            ]),
        )
Пример #43
0
# s2 holds 1.0, 2.0, 3.0 under the labels c, d, e.
s2 = Series(arange(1.0, 4.0), index=list("cde"))
s3 = s1 + s2
s3
s3.dropna()

##############################################################################

#### fillna

## fillna(value) fills every null value in a series with the given value.


# The two operands share only the label "c".
s1 = Series(arange(1.0, 4.0), index=list("abc"))
s2 = Series(arange(1.0, 4.0), index=list("cde"))
s3 = s1 + s2
s3.fillna(1.0)

################

## append
## append(series) appends one series to another, and is similar to list.append.

################

## replace
## replace(list,values) replaces a set of values in a Series with a new value.
## replace is similar to fillna, except that replace also replaces non-null values.

################

## update
Пример #44
0
 def count_words_in_series(series: pd.Series):
     """Count the words longer than three characters across a Series of strings.

     Missing entries are replaced by empty strings, all entries are joined
     with single spaces, passed through ``CounterWrapper.prepare_string``,
     and split on single spaces again.
     """
     joined = " ".join(series.fillna("").values.tolist())
     tokens = CounterWrapper.prepare_string(joined).split(" ")
     return Counter(token for token in tokens if len(token) > 3)
class MySeries:
    """Wrapper around a pandas ``Series`` whose operations return ``MySeries``.

    The wrapped Series is kept in ``x``; ``values`` and ``index`` are copied
    from it at construction time. Every method forwards its arguments
    unchanged and wraps whatever comes back in a new ``MySeries``.
    """

    def __init__(self, *args, **kwargs):
        """Build the wrapped Series from the given ``Series(...)`` arguments."""
        wrapped = Series(*args, **kwargs)
        self.x = wrapped
        self.values = wrapped.values
        self.index = wrapped.index

    def _from_pandas(self, func_name, args, kwargs):
        """Call ``pd.<func_name>(self.x, *args, **kwargs)`` and wrap the result."""
        func = getattr(pd, func_name)
        return MySeries(func(self.x, *args, **kwargs))

    def _from_series(self, method_name, args, kwargs):
        """Call ``self.x.<method_name>(*args, **kwargs)`` and wrap the result."""
        bound = getattr(self.x, method_name)
        return MySeries(bound(*args, **kwargs))

    # -- module-level pandas rolling functions -----------------------------
    # NOTE(review): these pd.rolling_* names are resolved at call time and
    # are believed to be absent from recent pandas releases -- confirm the
    # pandas version this class targets.

    def rolling_mean(self, *args, **kwargs):
        return self._from_pandas("rolling_mean", args, kwargs)

    def rolling_count(self, *args, **kwargs):
        return self._from_pandas("rolling_count", args, kwargs)

    def rolling_sum(self, *args, **kwargs):
        return self._from_pandas("rolling_sum", args, kwargs)

    def rolling_median(self, *args, **kwargs):
        return self._from_pandas("rolling_median", args, kwargs)

    def rolling_min(self, *args, **kwargs):
        return self._from_pandas("rolling_min", args, kwargs)

    def rolling_max(self, *args, **kwargs):
        return self._from_pandas("rolling_max", args, kwargs)

    def rolling_std(self, *args, **kwargs):
        return self._from_pandas("rolling_std", args, kwargs)

    def rolling_var(self, *args, **kwargs):
        return self._from_pandas("rolling_var", args, kwargs)

    def rolling_skew(self, *args, **kwargs):
        return self._from_pandas("rolling_skew", args, kwargs)

    def rolling_kurtosis(self, *args, **kwargs):
        return self._from_pandas("rolling_kurtosis", args, kwargs)

    def rolling_window(self, *args, **kwargs):
        return self._from_pandas("rolling_window", args, kwargs)

    # -- methods of the wrapped Series -------------------------------------

    def cumprod(self, *args, **kwargs):
        return self._from_series("cumprod", args, kwargs)

    def cumsum(self, *args, **kwargs):
        return self._from_series("cumsum", args, kwargs)

    def diff(self, *args, **kwargs):
        return self._from_series("diff", args, kwargs)

    def div(self, *args, **kwargs):
        return self._from_series("div", args, kwargs)

    def mul(self, *args, **kwargs):
        return self._from_series("mul", args, kwargs)

    def add(self, *args, **kwargs):
        return self._from_series("add", args, kwargs)

    def dropna(self, *args, **kwargs):
        return self._from_series("dropna", args, kwargs)

    def fillna(self, *args, **kwargs):
        return self._from_series("fillna", args, kwargs)

    def floordiv(self, *args, **kwargs):
        return self._from_series("floordiv", args, kwargs)

    def mod(self, *args, **kwargs):
        return self._from_series("mod", args, kwargs)

    def nlargest(self, *args, **kwargs):
        return self._from_series("nlargest", args, kwargs)

    def nonzero(self, *args, **kwargs):
        return self._from_series("nonzero", args, kwargs)

    def nsmallest(self, *args, **kwargs):
        return self._from_series("nsmallest", args, kwargs)

    def pow(self, *args, **kwargs):
        return self._from_series("pow", args, kwargs)

    def rank(self, *args, **kwargs):
        return self._from_series("rank", args, kwargs)

    def round(self, *args, **kwargs):
        return self._from_series("round", args, kwargs)

    def shift(self, *args, **kwargs):
        return self._from_series("shift", args, kwargs)

    def sub(self, *args, **kwargs):
        return self._from_series("sub", args, kwargs)

    def abs(self, *args, **kwargs):
        return self._from_series("abs", args, kwargs)

    def clip(self, *args, **kwargs):
        return self._from_series("clip", args, kwargs)

    def clip_lower(self, *args, **kwargs):
        return self._from_series("clip_lower", args, kwargs)

    def clip_upper(self, *args, **kwargs):
        return self._from_series("clip_upper", args, kwargs)

    def interpolate(self, *args, **kwargs):
        return self._from_series("interpolate", args, kwargs)

    def resample(self, *args, **kwargs):
        return self._from_series("resample", args, kwargs)

    def replace(self, *args, **kwargs):
        return self._from_series("replace", args, kwargs)
Пример #46
0
# Count how many times each value occurs
print(data.value_counts())

# Check membership in the given values; yields True/False per entry
data.isin(['a', 'b'])

# Flag missing values (NaN, "Not a Number"); yields True/False per entry
print(data.isnull())

# Handling NA: the dropna function
print(data.dropna()) # drops the whole row if it contains even a single NA
print(data.dropna(how = 'all')) # drops a row only when every value in it is NA

# Handling NA: replace with 0
print(data.fillna(0))

# Handling NA: replace with the preceding value
print(data.fillna(method = 'ffill')) # every NA takes the value that precedes it
print(data.fillna(method = 'ffill', limit = 1)) # fill from the preceding value only once; later NAs stay NA


# Handling NA: replace with the mean
print(data.fillna(data.mean())) # mean value


# Show only the values that are not null
print(data[data.notnull()])

# Generate a range of dates
pd.date_range(start = '2020-01-01', end = '2020-01-07')
Пример #47
0
    def test_datetime64_tz_fillna(self):
        """Check ``Series.fillna`` on naive and tz-aware datetimes for two zones.

        For each zone, every case fills the two ``NaT`` slots of ``s`` with a
        scalar or a ``{index label: value}`` dict, compares the outcome with
        an explicitly built Series, and re-checks that ``s`` still has its
        nulls.
        """
        d1, d2, d3, d4 = (
            '2011-01-01 10:00',
            '2011-01-02 10:00',
            '2011-01-03 10:00',
            '2011-01-04 10:00',
        )

        def naive(stamp):
            return Timestamp(stamp)

        def aware(stamp):
            return Timestamp(stamp, tz=tz)

        def check(filled, wanted):
            self.assert_series_equal(wanted, filled)
            # check s is not changed
            self.assert_series_equal(pd.isnull(s), null_loc)

        for tz in ['US/Eastern', 'Asia/Tokyo']:
            # DatetimeBlock
            s = Series([naive(d1), pd.NaT, naive(d3), pd.NaT])
            null_loc = pd.Series([False, True, False, True])

            check(
                s.fillna(pd.Timestamp(d2)),
                Series([naive(d1), naive(d2), naive(d3), naive(d2)]),
            )
            check(
                s.fillna(pd.Timestamp(d2, tz=tz)),
                Series([naive(d1), aware(d2), naive(d3), aware(d2)]),
            )
            check(
                s.fillna('AAA'),
                Series([naive(d1), 'AAA', naive(d3), 'AAA'], dtype=object),
            )
            check(
                s.fillna({1: pd.Timestamp(d2, tz=tz), 3: pd.Timestamp(d4)}),
                Series([naive(d1), aware(d2), naive(d3), naive(d4)]),
            )
            check(
                s.fillna({1: pd.Timestamp(d2), 3: pd.Timestamp(d4)}),
                Series([naive(d1), naive(d2), naive(d3), naive(d4)]),
            )

            # DatetimeBlockTZ
            s = pd.Series(pd.DatetimeIndex([d1, pd.NaT, d3, pd.NaT], tz=tz))
            self.assertEqual(s.dtype, 'datetime64[ns, {0}]'.format(tz))
            self.assert_series_equal(pd.isnull(s), null_loc)

            check(
                s.fillna(pd.Timestamp(d2)),
                Series([aware(d1), naive(d2), aware(d3), naive(d2)]),
            )
            check(
                s.fillna(pd.Timestamp(d2, tz=tz)),
                Series(pd.DatetimeIndex([d1, d2, d3, d2], tz=tz)),
            )
            check(
                s.fillna(pd.Timestamp(d2, tz=tz).to_pydatetime()),
                Series(pd.DatetimeIndex([d1, d2, d3, d2], tz=tz)),
            )
            check(
                s.fillna('AAA'),
                Series([aware(d1), 'AAA', aware(d3), 'AAA'], dtype=object),
            )
            check(
                s.fillna({1: pd.Timestamp(d2, tz=tz), 3: pd.Timestamp(d4)}),
                Series([aware(d1), aware(d2), aware(d3), naive(d4)]),
            )
            check(
                s.fillna({
                    1: pd.Timestamp(d2, tz=tz),
                    3: pd.Timestamp(d4, tz=tz),
                }),
                Series([aware(d1), aware(d2), aware(d3), aware(d4)]),
            )

            # filling with a naive/other zone, coerce to object
            check(
                s.fillna(Timestamp('20130101')),
                Series([
                    aware(d1),
                    Timestamp('2013-01-01'),
                    aware(d3),
                    Timestamp('2013-01-01'),
                ]),
            )
            check(
                s.fillna(Timestamp('20130101', tz='US/Pacific')),
                Series([
                    aware(d1),
                    Timestamp('2013-01-01', tz='US/Pacific'),
                    aware(d3),
                    Timestamp('2013-01-01', tz='US/Pacific'),
                ]),
            )
# Label-based selection of three entries.
obj.loc[['a', 'c', 'g']]
# NOTE(review): `.ix` is believed to be unavailable in current pandas
# releases; switch to .loc (labels) or .iloc (positions) once the intended
# semantics of this slice are confirmed.
obj.ix[1:6]
obj.shape
obj.reindex(['f', 'd', 'z'])
obj['z'] = 4
obj.mean()
# presumably Series.argmin() gives the integer position -- verify
obj.idxmin()  #how to return an integer position?
# From here on the index is 0 .. len(obj) - 1 instead of the previous labels.
obj.index = np.arange(len(obj.index))
obj.idxmax()
obj.sort_index()
obj.sort_values()
obj[obj > 0]
# NOTE(review): the index was replaced with integers above, yet this line
# still addresses string labels -- confirm what it is meant to select.
obj.ix[['d', 'f', 'j']] = np.nan
obj[obj.notnull()]
obj.dropna()
obj.fillna(0)
obj.fillna(method='bfill')
# The literal used to list 'f' twice ('f': 5 after 'b', then 'f': 9 after
# 'g'). In a dict display the later value wins and the key keeps its first
# position, so the dead 'f': 5 entry is folded into a single 'f': 9 here.
# NOTE(review): if one of the two was meant to be a different key, restore it.
obj2 = Series({
    'a': 1,
    'b': 3,
    'f': 9,
    'g': 7,
    'h': 10,
    'x': 11,
    'y': 12,
    'z': 10
})
# obj2 is added as built, then again after sorting its index.
obj + obj2
obj + obj2.sort_index()
obj3 = Series(np.random.randn(11),
Пример #49
0
 def test_dt_accessor_updates_on_inplace(self):
     """``.dt`` must reflect values written by an in-place ``fillna``."""
     start = "2018-01-01"
     dates = Series(pd.date_range(start, periods=10))
     # Blank out row 2, then fill it in place with the first date again.
     dates[2] = None
     dates.fillna(pd.Timestamp(start), inplace=True)

     as_dates = dates.dt.date
     assert as_dates[0] == as_dates[2]
Пример #50
0
def fillna(input: pd.Series) -> pd.Series:
    """Return *input* with missing values replaced by ``""`` and cast to ``str``."""
    without_gaps = input.fillna("")
    return without_gaps.astype("str")
Пример #51
0
def answer(request):
    """Handle one step of the symptom question-and-answer dialogue.

    Loads the reference CSV tables from ``DATA_DIR``, fetches the per-session
    state from ``cache`` and dispatches on ``request.data["pathtype"]``:

    * ``"getanswer"``: store the submitted answer, then either choose the next
      symptom to ask about or, when the dialogue ends, run the classifier
      ensemble and report the three highest-scoring diseases.
    * ``"cancelanswer"``: reset the stored state of one question.

    For any other ``pathtype`` ``python_output`` stays an empty dict.

    Args:
        request: object whose ``data`` attribute is a dict with the keys
            ``pathtype``, ``answer_record_seqno`` and ``content`` (see the
            commented example payload in the body).

    Returns:
        ``JsonResponse`` built from ``python_output``.
    """
    # print(type(request.data))
    # print(request.data)
    DATA_DIR = os.path.join(PROJECT_DIR, 'data')
    #print(DATA_DIR)
    #print('read case number data')
    # Only the first 77 rows of the case-number table are used.
    patient_number_data = pd.read_csv(
        os.path.join(DATA_DIR, "case_number_1025.csv"))[0:77]

    # Case count per disease, indexed by disease code.
    patient_number = pd.Series(patient_number_data['case_number'].values,
                               index=patient_number_data['disease_code'])
    patient_number.name = 'number'

    #print('read case number ratio data')
    patient_ratio_data = pd.read_csv(
        os.path.join(DATA_DIR, "case_number_ratio_1025.csv"))[0:77]
    # Case ratio per disease, indexed by disease code.
    patient_ratio = pd.Series(patient_ratio_data['case_ratio'].values,
                              index=patient_ratio_data['disease_code'])
    patient_ratio.name = 'ratio'

    #print('read disease data')
    # One row per symptom; after set_index below the rows are keyed by the
    # 'symptom_cause' code.
    # NOTE(review): the remaining columns are presumably one per disease code
    # (they are combined with patient_number / patient_ratio) -- confirm.
    disease_sym_matrix = pd.read_csv(
        os.path.join(DATA_DIR, '77_disease_data_1025.csv'))
    SD_symcode = pd.DataFrame(disease_sym_matrix['symptom_cause'])
    SD_symcode.columns = ['symptom_code']
    symptom_code = disease_sym_matrix['symptom_cause']
    symptom_code.name = '0'
    disease_code = patient_number_data['disease_code']
    disease_code.name = '0'
    disease_number = patient_number_data.shape[0]
    symptom_number = len(symptom_code)
    disease_sym_matrix.set_index(["symptom_cause"], inplace=True)

    # input data information
    jsondata = request.data
    # Example payload:
    # jsondata = {
    #     "pathtype": "getanswer",
    #     "answer_record_seqno": "201908051619300001938_007",
    #     "content":
    #         {
    #             "answer_mainseqno": "201908051619300001938",
    #             "question_seq": "1",
    #             "actual_symptom_code": ["C0015230"],
    #             "answer_detail": ["是"]
    #         }
    # }
    #print(jsondata["pathtype"])
    content = jsondata['content']
    #print(content)

    python_output = {}
    # Session state is cached under 'global_dict' + answer_mainseqno;
    # None when no state has been stored for this session.
    global_dict = cache.get(('global_dict' + content["answer_mainseqno"]),
                            None)

    # programming
    # interface 1--getanswer
    if jsondata["pathtype"] == "getanswer":
        #print('-'*10 + 'getanswer')
        if content["question_seq"] == "1":
            # First question: start a fresh session state.
            # 'A<seqno>': 20 rows (index 1..20) holding the asked symptom code
            #             and the answer given to it.
            # 'B<seqno>': one row per disease code, columns 1..20, all 1.0.
            global_dict = {}
            global_dict['A' + content["answer_mainseqno"]] = pd.DataFrame(
                data=0,
                columns=["symptom_code", "answer_detail"],
                index=range(1, 21))
            global_dict['A' +
                        content["answer_mainseqno"]]['symptom_code'] = '0'
            global_dict['B' + content["answer_mainseqno"]] = pd.DataFrame(
                data=1,
                columns=range(1, 21),
                index=patient_number.index,
                dtype=float)
        # check the json string whether has wrong information or whether to end(20 question so far).
        if not global_dict:
            # No cached state for this session key.
            result_code = 'FAIL'
            answer_mainseqno = ''
            ifend = ''
            next_symptom_code = []
            next_answer_detail = []
            confirm_disease_code = ''
            confirm_disease_percent = ''
            disease_array = []
            disease_case_number = ''
            errmessage = '整体会话主键不存在'
        elif content["actual_symptom_code"][0] not in list(symptom_code):
            # The submitted symptom code is not in the symptom table.
            result_code = 'FAIL'
            answer_mainseqno = ''
            ifend = ''
            next_symptom_code = ['']
            next_answer_detail = ['']
            confirm_disease_code = ''
            confirm_disease_percent = ''
            disease_array = []
            disease_case_number = ''
            errmessage = '症状代码不存在'
        else:
            # confirm the correct json string
            # update A_ and B_
            # Row (question_seq - 1), column 0 of A is the asked symptom code.
            global_dict['A' + content["answer_mainseqno"]].iloc[(int(content["question_seq"]) - 1), 0] = \
            content["actual_symptom_code"][0]
            if content["answer_detail"][0] == "是":
                # "Yes" answer: store 1 and copy the symptom's matrix row into
                # column question_seq of B.
                global_dict['A' + content["answer_mainseqno"]].iloc[(
                    int(content["question_seq"]) - 1), 1] = 1
                global_dict['B' + content["answer_mainseqno"]][int(
                    content["question_seq"])] = disease_sym_matrix.loc[
                        content["actual_symptom_code"][0]]
            elif content["answer_detail"][0] == "否":
                # "No" answer: store 0 and use the complement (1 - row) in B.
                global_dict['A' + content["answer_mainseqno"]].iloc[(
                    int(content["question_seq"]) - 1), 1] = 0
                global_dict['B' + content["answer_mainseqno"]][int(content["question_seq"])] = 1 - \
                                                                                             disease_sym_matrix.loc[
                                                                                                 content[
                                                                                                     "actual_symptom_code"][
                                                                                                     0]]
            else:
                # Any other answer: store NaN and leave the B column neutral (1).
                global_dict['A' + content["answer_mainseqno"]].iloc[(
                    int(content["question_seq"]) - 1), 1] = np.nan
                global_dict['B' + content["answer_mainseqno"]][int(
                    content["question_seq"])] = 1
            # Persist the updated state for 86400 seconds.
            cache.set(('global_dict' + content["answer_mainseqno"]),
                      global_dict,
                      timeout=86400)
            # establish temporary dataframe,store A and B,perform single-threaded operations
            # In the working copy, rows whose answer is NaN are reset to the
            # "not asked" values ('0' / 0); the cached A is left unchanged.
            A = global_dict['A' + content["answer_mainseqno"]].copy()
            A.loc[np.isnan(A['answer_detail']), 'symptom_code'] = '0'
            A.loc[np.isnan(A['answer_detail']), 'answer_detail'] = 0
            B = global_dict['B' + content["answer_mainseqno"]]
            # calculate similar disease case number(disease_case_number) and probability(disease_array)
            # has been removed
            # calculate entropy to get the next question
            # Per-symptom accumulators: H20/pb20 belong to the "symptom absent"
            # case (they use 1 - matrix row), H21/pb21 to the "symptom present"
            # case (they use the matrix row).
            H20 = Series([0] * symptom_number)
            H20 = H20.astype(float)
            H21 = Series([0] * symptom_number)
            H21 = H21.astype(float)
            # NOTE(review): pb20 and pb21 are rebuilt from H20 rather than from
            # their own Series; H20 is still all zeros here, so the values are
            # the same zero-filled float Series either way.
            pb20 = Series([0] * symptom_number)
            pb20 = H20.astype(float)
            pb21 = Series([0] * symptom_number)
            pb21 = H20.astype(float)
            # Element-wise product of B columns 1..19.
            # NOTE(review): column 20 is not part of the product -- confirm
            # this is intended.
            B_mul = B[1] * B[2] * B[3] * B[4] * B[5] * B[6] * B[7] * B[8] * B[9] * B[10] * B[11] * B[12] * B[13] * \
                    B[14] * B[15] * B[16] * B[17] * B[18] * B[19]
            for i in range(symptom_number):
                # Membership is tested against the cached A (not the copy), so
                # symptoms answered with NaN also count as already asked.
                if symptom_code[i] in list(global_dict[
                        'A' + content["answer_mainseqno"]]["symptom_code"]):
                    # Already asked: large fixed values, so H2 for this symptom
                    # (100*1 + 100*1) cannot be the minimum picked below while
                    # any other symptom scores lower.
                    H20[i] = 100
                    H21[i] = 100
                    pb20[i] = 1
                    pb21[i] = 1
                else:
                    # Case-count-weighted share of the "absent" outcome.
                    pb20[i] = (B_mul *
                               (1 - disease_sym_matrix.loc[symptom_code[i]]) *
                               patient_number).sum() / (B_mul *
                                                        patient_number).sum()
                    pb21[i] = 1 - pb20[i]
                    if pb21[i] == 0:
                        # Fixed value 10 when the "present" share is exactly 0.
                        H20[i] = 10
                        H21[i] = 10
                    else:
                        # Entropy (log base disease_number) of the normalised
                        # per-disease weights for the "absent" case; zero
                        # weights are dropped before taking the log.
                        data_pba20 = B_mul * (
                            1 - disease_sym_matrix.loc[symptom_code[i]])
                        data_mul20 = data_pba20.mul(patient_ratio, axis=0)
                        data_pab20 = data_mul20 / data_mul20.sum()
                        data_pab_d020 = data_pab20[data_pab20 != 0]
                        for j in range(0, len(data_pab_d020)):
                            H20[i] = H20[i] - data_pab_d020[j] * math.log(
                                data_pab_d020[j], disease_number)
                        # Same computation for the "present" case.
                        data_pba21 = B_mul * disease_sym_matrix.loc[
                            symptom_code[i]]
                        data_mul21 = data_pba21.mul(patient_ratio, axis=0)
                        data_pab21 = data_mul21 / data_mul21.sum()
                        data_pab_d021 = data_pab21[data_pab21 != 0]
                        for j in range(0, len(data_pab_d021)):
                            H21[i] = H21[i] - data_pab_d021[j] * math.log(
                                data_pab_d021[j], disease_number)
            H20 = H20.fillna(0)
            H21 = H21.fillna(0)
            # Combined score per symptom: each entropy weighted by its share.
            H2 = H20 * pb20 + H21 * pb21
            # determine whether end or not.
            # Continue only while the smallest score is below 1 and fewer than
            # 20 questions have been asked.
            if H2.min() < 1 and int(content["question_seq"]) < 20:
                # get the next question(corresponding symptom code) and other output information
                # The next symptom is the first one with the minimal H2.
                H2_sym = pd.DataFrame({'H2': H2, 'sym': symptom_code})
                next_symptom_code_str = H2_sym['sym'][
                    H2_sym['H2'] == H2_sym['H2'].min()].values[0]
                next_symptom_code = []
                next_symptom_code.append(next_symptom_code_str)
                result_code = 'SUCCESS'
                answer_mainseqno = content["answer_mainseqno"]
                ifend = '0'
                next_answer_detail = ["是", "否", "不确定"]
                confirm_disease_code = ''
                confirm_disease_percent = ''
                disease_array = ['']
                errmessage = ''
            else:
                #get the model file using joblib
                # The four model files are loaded from DATA_DIR on each call
                # that reaches this branch.
                forest_clf_file = "forest_clf_1025.pkl"
                forest_clf = joblib.load(
                    os.path.join(DATA_DIR, forest_clf_file))
                gnb_clf_file = "gnb_clf_1025.pkl"
                gnb_clf = joblib.load(os.path.join(DATA_DIR, gnb_clf_file))
                mnb_clf_file = "mnb_clf_1025.pkl"
                mnb_clf = joblib.load(os.path.join(DATA_DIR, mnb_clf_file))
                bnb_clf_file = "bnb_clf_1025.pkl"
                bnb_clf = joblib.load(os.path.join(DATA_DIR, bnb_clf_file))
                #ensemble learning to predict disease
                # Feature vector: one answer value per symptom (0 where the
                # symptom was not asked), ordered by symptom code.
                # NOTE(review): assumes the models were trained with features
                # in this same sorted order -- confirm.
                Sd = pd.merge(A, SD_symcode, how='right')
                Sd['answer_detail'] = Sd['answer_detail'].fillna(0)
                Sd = Sd.sort_values(by="symptom_code", ascending=True)
                some_digit = np.array(Sd["answer_detail"])
                #first classifier:Bayes method,based on probability
                # Multiply, per disease, the matrix entries of every symptom
                # answered 1, weight by the case ratio and normalise.
                A_sym = list(A[A["answer_detail"] == 1]["symptom_code"])
                sym_num = len(A_sym)
                Bys_dsm = (disease_sym_matrix.copy()).T
                Bys_dsm['1'] = 1
                Bys_pba = Bys_dsm['1']
                for i in range(sym_num):
                    Bys_pba = Bys_pba * Bys_dsm[A_sym[i]]
                Bys_baa = Bys_pba * patient_ratio
                Bys_pab = Bys_baa / sum(Bys_baa)
                bys_clf_predict = disease_code.copy()
                bys_clf_predict = pd.DataFrame(bys_clf_predict)
                bys_clf_predict.columns = ['bys_clf.classes']
                bys_clf_predict['predict_proba'] = list(Bys_pab)
                #second classifier:Random Forest
                # (the classes column is labelled 'sgd_clf.classes' although it
                # holds forest_clf.classes_)
                forest_clf_predict_proba1 = pd.DataFrame(
                    forest_clf.predict_proba([some_digit])[0],
                    columns=['predict_proba'])
                forest_clf_classes1 = pd.DataFrame(forest_clf.classes_,
                                                   columns=['sgd_clf.classes'])
                forest_clf_predict = pd.merge(forest_clf_classes1,
                                              forest_clf_predict_proba1,
                                              left_index=True,
                                              right_index=True)
                #third classifier:GaussianNB
                gnb_clf_predict_proba1 = pd.DataFrame(
                    gnb_clf.predict_proba([some_digit])[0],
                    columns=['predict_proba'])
                gnb_clf_classes1 = pd.DataFrame(gnb_clf.classes_,
                                                columns=['gnd_clf.classes'])
                gnb_clf_predict = pd.merge(gnb_clf_classes1,
                                           gnb_clf_predict_proba1,
                                           left_index=True,
                                           right_index=True)
                #fourth:MultinomialNB
                mnb_clf_predict_proba1 = pd.DataFrame(
                    mnb_clf.predict_proba([some_digit])[0],
                    columns=['predict_proba'])
                mnb_clf_classes1 = pd.DataFrame(mnb_clf.classes_,
                                                columns=['mnd_clf.classes'])
                mnb_clf_predict = pd.merge(mnb_clf_classes1,
                                           mnb_clf_predict_proba1,
                                           left_index=True,
                                           right_index=True)
                #fifth:BernoulliNB
                bnb_clf_predict_proba1 = pd.DataFrame(
                    bnb_clf.predict_proba([some_digit])[0],
                    columns=['predict_proba'])
                bnb_clf_classes1 = pd.DataFrame(bnb_clf.classes_,
                                                columns=['bnd_clf.classes'])
                bnb_clf_predict = pd.merge(bnb_clf_classes1,
                                           bnb_clf_predict_proba1,
                                           left_index=True,
                                           right_index=True)
                #ensemble soft voting
                # Weights: 1/5 Bayes, 3/5 random forest, 1/15 for each of the
                # three naive-Bayes models.
                # NOTE(review): the fractions rely on true division; the five
                # probability columns are combined by positional index, which
                # assumes every classes_ array is in disease_code order --
                # confirm.
                ensemble_proba = (1 / 5) * bys_clf_predict['predict_proba'] + (
                    3 / 5) * forest_clf_predict['predict_proba'] + (
                        1 / 15) * gnb_clf_predict['predict_proba'] + (
                            1 / 15) * mnb_clf_predict['predict_proba'] + (
                                1 / 15) * bnb_clf_predict['predict_proba']
                esm_clf_predict = disease_code.copy()
                esm_clf_predict = pd.DataFrame(esm_clf_predict)
                esm_clf_predict.columns = ['esm_clf.classes']
                # The combined probability is scaled by 0.98 before ranking.
                esm_clf_predict['predict_proba'] = ensemble_proba * 0.98
                esm_clf_predict = esm_clf_predict.sort_values(
                    by="predict_proba", ascending=False)
                #extract outcome
                result_code = 'SUCCESS'
                answer_mainseqno = content["answer_mainseqno"]
                ifend = '1'
                next_symptom_code = ['']
                next_answer_detail = ['']
                #need to calculate the similarity
                # Report the best match plus the top three as strings.
                confirm_disease_code = str(
                    list(esm_clf_predict['esm_clf.classes'])[0])
                confirm_disease_percent = str(
                    list(esm_clf_predict['predict_proba'])[0])
                disease_array = [{
                    "disease_code":
                    str(list(esm_clf_predict['esm_clf.classes'])[0]),
                    "disease_percent":
                    str(list(esm_clf_predict['predict_proba'])[0])
                }, {
                    "disease_code":
                    str(list(esm_clf_predict['esm_clf.classes'])[1]),
                    "disease_percent":
                    str(list(esm_clf_predict['predict_proba'])[1])
                }, {
                    "disease_code":
                    str(list(esm_clf_predict['esm_clf.classes'])[2]),
                    "disease_percent":
                    str(list(esm_clf_predict['predict_proba'])[2])
                }]
                errmessage = ''
                # The dialogue is finished: drop the cached session state.
                cache.delete(('global_dict' + content["answer_mainseqno"]))
                del forest_clf, gnb_clf, mnb_clf, bnb_clf
        # "disease_case_number" is always reported as an empty string.
        python_output = {
            "resultcode": result_code,
            "answer_record_seqno": jsondata["answer_record_seqno"],
            "returnmessage": {
                "answer_mainseqno": answer_mainseqno,
                "ifend": str(ifend),
                "next_symptom_code": next_symptom_code,
                "next_answer_detail": next_answer_detail,
                "confirm_disease_code": confirm_disease_code,
                "confirm_disease_percent": str(confirm_disease_percent),
                "disease_array": disease_array,
                "disease_case_number": ''
            },
            "errmessage": errmessage
        }
        #print('-'*10 + 'getanswer end')

    # interface 2--cancelanswer
    elif jsondata["pathtype"] == "cancelanswer":
        #print('-'*10 + 'cancelanswer')
        if not global_dict:
            # No cached state for this session key.
            result_code = 'FAIL'
            answer_mainseqno = ''
            errmessage = '整体会话主键不存在'
        else:
            # Reset the row of question_seq in A ('0' / 0) and the matching
            # column of B (1), then store the state again.
            global_dict['A' + content["answer_mainseqno"]].iloc[(
                int(content["question_seq"]) - 1), 0] = '0'
            global_dict['A' + content["answer_mainseqno"]].iloc[(
                int(content["question_seq"]) - 1), 1] = 0
            global_dict['B' + content["answer_mainseqno"]][int(
                content["question_seq"])] = 1
            result_code = 'SUCCESS'
            answer_mainseqno = content["answer_mainseqno"]
            errmessage = ''
            cache.set(('global_dict' + content["answer_mainseqno"]),
                      global_dict,
                      timeout=86400)
        python_output = {
            "resultcode": result_code,
            "answer_record_seqno": jsondata["answer_record_seqno"],
            "answer_mainseqno": answer_mainseqno,
            "errmessage": errmessage
        }
        #print('-'*10 + 'cancelanswer end')

    # output the json string
    del patient_number_data, patient_ratio_data, disease_sym_matrix
    # NOTE(review): json_output is computed but never used; the response is
    # built from python_output directly.
    json_output = json.dumps(python_output)
    #print(json_output)
    return JsonResponse(python_output, safe=False)
Пример #52
0
def target_discrete_price_variation(pct_var: pd.Series, **kwargs):
    """Discretise a percentage-variation series, keeping its index.

    Missing values are forward-filled before the series is passed to
    ``to_discrete_double`` with the thresholds -0.01 and 0.01.
    NOTE(review): the class labels are whatever ``to_discrete_double``
    returns; confirm against that helper.

    Args:
        pct_var: series of percentage variations.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        pd.Series of classes indexed like ``pct_var``.
    """
    # Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
    classes = to_discrete_double(pct_var.ffill(), -0.01, 0.01)
    return pd.Series(classes, index=pct_var.index)
Пример #53
0
    def test_datetime64_tz_fillna(self):
        """Exercise ``Series.fillna`` on tz-naive and tz-aware datetime data.

        For every time zone the checks run first on a naive datetime Series
        and then on a Series built from a tz-aware DatetimeIndex.  Fill
        values cover scalars (naive, same zone, another zone, a plain
        string) and per-position dicts.
        """
        d1 = '2011-01-01 10:00'
        d2 = '2011-01-02 10:00'
        d3 = '2011-01-03 10:00'
        d4 = '2011-01-04 10:00'

        for tz in ['US/Eastern', 'Asia/Tokyo']:
            # ---- DatetimeBlock: naive values with two missing slots ----
            naive = Series([Timestamp(d1), pd.NaT, Timestamp(d3), pd.NaT])

            # naive scalar
            got = naive.fillna(pd.Timestamp(d2))
            want = Series([Timestamp(d1), Timestamp(d2),
                           Timestamp(d3), Timestamp(d2)])
            self.assert_series_equal(want, got)

            # tz-aware scalar
            got = naive.fillna(pd.Timestamp(d2, tz=tz))
            want = Series([Timestamp(d1), Timestamp(d2, tz=tz),
                           Timestamp(d3), Timestamp(d2, tz=tz)])
            self.assert_series_equal(want, got)

            # non-datetime scalar
            got = naive.fillna('AAA')
            want = Series([Timestamp(d1), 'AAA', Timestamp(d3), 'AAA'],
                          dtype=object)
            self.assert_series_equal(want, got)

            # dict mixing an aware and a naive value
            fill_map = {1: pd.Timestamp(d2, tz=tz), 3: pd.Timestamp(d4)}
            got = naive.fillna(fill_map)
            want = Series([Timestamp(d1), Timestamp(d2, tz=tz),
                           Timestamp(d3), Timestamp(d4)])
            self.assert_series_equal(want, got)

            # dict with naive values only
            fill_map = {1: pd.Timestamp(d2), 3: pd.Timestamp(d4)}
            got = naive.fillna(fill_map)
            want = Series([Timestamp(d1), Timestamp(d2),
                           Timestamp(d3), Timestamp(d4)])
            self.assert_series_equal(want, got)

            # ---- DatetimeBlockTZ: the same layout, but tz-aware ----
            aware = pd.Series(
                pd.DatetimeIndex([d1, pd.NaT, d3, pd.NaT], tz=tz))

            # naive scalar
            got = aware.fillna(pd.Timestamp(d2))
            want = Series([Timestamp(d1, tz=tz), Timestamp(d2),
                           Timestamp(d3, tz=tz), Timestamp(d2)])
            self.assert_series_equal(want, got)

            # scalar in the same zone
            got = aware.fillna(pd.Timestamp(d2, tz=tz))
            want = Series(pd.DatetimeIndex([d1, d2, d3, d2], tz=tz))
            self.assert_series_equal(want, got)

            # same value passed as a python datetime
            py_fill = pd.Timestamp(d2, tz=tz).to_pydatetime()
            got = aware.fillna(py_fill)
            want = Series(pd.DatetimeIndex([d1, d2, d3, d2], tz=tz))
            self.assert_series_equal(want, got)

            # non-datetime scalar
            got = aware.fillna('AAA')
            want = Series([Timestamp(d1, tz=tz), 'AAA',
                           Timestamp(d3, tz=tz), 'AAA'],
                          dtype=object)
            self.assert_series_equal(want, got)

            # dict mixing an aware and a naive value
            fill_map = {1: pd.Timestamp(d2, tz=tz), 3: pd.Timestamp(d4)}
            got = aware.fillna(fill_map)
            want = Series([Timestamp(d1, tz=tz), Timestamp(d2, tz=tz),
                           Timestamp(d3, tz=tz), Timestamp(d4)])
            self.assert_series_equal(want, got)

            # dict with values in the same zone only
            fill_map = {1: pd.Timestamp(d2, tz=tz),
                        3: pd.Timestamp(d4, tz=tz)}
            got = aware.fillna(fill_map)
            want = Series([Timestamp(d1, tz=tz), Timestamp(d2, tz=tz),
                           Timestamp(d3, tz=tz), Timestamp(d4, tz=tz)])
            self.assert_series_equal(want, got)

            # filling with a naive/other zone, coerce to object
            got = aware.fillna(Timestamp('20130101'))
            want = Series([Timestamp(d1, tz=tz), Timestamp('2013-01-01'),
                           Timestamp(d3, tz=tz), Timestamp('2013-01-01')])
            self.assert_series_equal(want, got)

            got = aware.fillna(Timestamp('20130101', tz='US/Pacific'))
            want = Series([Timestamp(d1, tz=tz),
                           Timestamp('2013-01-01', tz='US/Pacific'),
                           Timestamp(d3, tz=tz),
                           Timestamp('2013-01-01', tz='US/Pacific')])
            self.assert_series_equal(want, got)
Пример #54
0
def target_binary_price_variation(pct_var: pd.Series, **kwargs):
    """Discretise a percentage-variation series around 0, keeping its index.

    Missing values are forward-filled before the series is passed to
    ``to_discrete_single`` with the threshold 0.00.
    NOTE(review): the class labels are whatever ``to_discrete_single``
    returns; confirm against that helper.

    Args:
        pct_var: series of percentage variations.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        pd.Series of classes indexed like ``pct_var``.
    """
    # Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
    classes = to_discrete_single(pct_var.ffill(), 0.00)
    return pd.Series(classes, index=pct_var.index)
Пример #55
0
    def test_fillna(self):
        """Cover method-based, value-based, aligned and limited fillna."""
        series = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))

        # Nothing is missing yet, so a forward fill leaves the data as is.
        forward = series.fillna(method='ffill')
        self.assert_numpy_array_equal(series, forward)

        series[2] = np.NaN

        forward = series.fillna(method='ffill')
        self.assert_numpy_array_equal(forward, [0., 1., 1., 3., 4.])
        backward = series.fillna(method='backfill')
        self.assert_numpy_array_equal(backward, [0., 1., 3., 3., 4.])

        by_value = series.fillna(value=5)
        self.assert_numpy_array_equal(by_value, [0., 1., 5., 3., 4.])

        # Neither "no arguments" nor "value plus method" is accepted.
        self.assertRaises(ValueError, series.fillna)
        self.assertRaises(ValueError, self.ts.fillna, value=0, method='ffill')

        # GH 5703
        only_nan = Series([np.nan])
        only_one = Series([1])
        expected = Series([1.])

        result = only_nan.fillna(only_one)
        assert_series_equal(result, expected)

        result = only_nan.fillna({})
        assert_series_equal(result, only_nan)

        result = only_nan.fillna(Series(()))
        assert_series_equal(result, only_nan)

        result = only_one.fillna(only_nan)
        assert_series_equal(result, only_one)

        result = only_nan.fillna({0: 1})
        assert_series_equal(result, expected)

        result = only_nan.fillna({1: 1})
        assert_series_equal(result, Series([np.nan]))

        result = only_nan.fillna({0: 1, 1: 1})
        assert_series_equal(result, expected)

        result = only_nan.fillna(Series({0: 1, 1: 1}))
        assert_series_equal(result, expected)

        result = only_nan.fillna(Series({0: 1, 1: 1}, index=[4, 5]))
        assert_series_equal(result, only_nan)

        # Fill values are matched by label.
        filler = Series([0, 1, 2], list('abc'))
        holed = Series([0, np.nan, 2], list('bac'))
        result = holed.fillna(filler)
        expected = Series([0, 0, 2.], list('bac'))
        assert_series_equal(result, expected)

        # limit
        all_nan = Series(np.nan, index=[0, 1, 2])

        result = all_nan.fillna(999, limit=1)
        expected = Series([999, np.nan, np.nan], index=[0, 1, 2])
        assert_series_equal(result, expected)

        result = all_nan.fillna(999, limit=2)
        expected = Series([999, 999, np.nan], index=[0, 1, 2])
        assert_series_equal(result, expected)

        # GH 9043
        # make sure a string representation of int/float values can be filled
        # correctly without raising errors or being converted
        for val in ['0', '1.5', '-0.3']:
            floats = Series([0, 1, np.nan, np.nan, 4], dtype='float64')
            result = floats.fillna(val)
            expected = Series([0, 1, val, val, 4], dtype='object')
            assert_series_equal(result, expected)
Пример #56
0
    def test_fillna_categorical_raises(self):
        """fillna on a categorical Series rejects bad values and bad types."""
        values = ["a", np.nan, "b", np.nan, np.nan]
        cat_ser = Series(Categorical(values, categories=["a", "b"]))

        # "d" is not one of the categories, whatever container carries it.
        for bad_fill in ("d", Series("d"), {1: "d", 3: "a"}):
            with pytest.raises(ValueError,
                               match="fill value must be in categories"):
                cat_ser.fillna(bad_fill)

        # Lists, tuples and DataFrames are not accepted as fill values.
        type_cases = [
            (["a", "b"],
             '"value" parameter must be a scalar or dict, but you passed a "list"'),
            (("a", "b"),
             '"value" parameter must be a scalar or dict, but you passed a "tuple"'),
            (DataFrame({1: ["a"], 3: ["b"]}),
             '"value" parameter must be a scalar, dict '
             'or Series, but you passed a "DataFrame"'),
        ]
        for bad_fill, msg in type_cases:
            with pytest.raises(TypeError, match=msg):
                cat_ser.fillna(bad_fill)
Пример #57
0
class ESRIAsc:
    """In-memory representation of an ESRI ASCII (.asc) raster grid.

    Holds the six header fields of the format (``ncols``, ``nrows``,
    ``xllcorner``, ``yllcorner``, ``cellsize``, ``NODATA_value``) plus
    ``data``, the flat sequence of cell values in file order.
    """

    def __init__(self, file_path=None, ncols=None, nrows=None,
                 xllcorner=None, yllcorner=None, cellsize=1,
                 NODATA_value=-9999, data=None):
        """Build a grid from keyword values or from an .asc file.

        Arguments:
            file_path (str): optional path of an .asc file to read; when
                given, the file's header and data overwrite the keyword
                values below.
            ncols (int): number of columns
            nrows (int): number of rows
            xllcorner (float): header value ``xllcorner``
            yllcorner (float): header value ``yllcorner``
            cellsize (int or float): header value ``cellsize``
            NODATA_value (float): marker used for missing cells
            data: flat cell values; a pandas Series when read from a file

        Raises:
            AssertionError: if the file holds a number of values different
                from ``ncols * nrows``.
        """
        self.file_path = file_path
        self.ncols = ncols
        self.nrows = nrows
        self.xllcorner = xllcorner
        self.yllcorner = yllcorner
        self.cellsize = cellsize
        self.NODATA_value = NODATA_value
        self.data = data

        # if a file is provided, the file metadata will overwrite any
        # user-provided kwargs
        if file_path:
            self._load(file_path)

    def _load(self, file_path):
        """Fill the header fields and ``data`` from the file at *file_path*."""

        def next_header_value(handle):
            # Header lines look like "<name> <value>"; keep the value token.
            return handle.readline().strip().split()[1]

        # The context manager closes the handle even if parsing fails.
        with open(file_path, 'r') as f:
            self.ncols = int(next_header_value(f))
            self.nrows = int(next_header_value(f))
            self.xllcorner = float(next_header_value(f))
            self.yllcorner = float(next_header_value(f))
            raw_cellsize = next_header_value(f)
            self.NODATA_value = float(next_header_value(f))

            # should not be necessary for well-formed ESRI files, but
            # seems to be for CASiMiR
            data_str = ' '.join(line.strip() for line in f)

        # Keep integer cell sizes as int (so they are written back unchanged)
        # but also accept fractional ones instead of failing on them.
        try:
            self.cellsize = int(raw_cellsize)
        except ValueError:
            self.cellsize = float(raw_cellsize)

        self.data = Series(fromstring(data_str, dtype=float, sep=' '))

        colrow_prod = self.nrows*self.ncols
        assert len(self.data) == colrow_prod, \
            "length of .asc data does not equal product of ncols * nrows" \
            "\nncols: {}, nrows: {}, ncols*nrows: {} len(data): {}".format(
                self.ncols, self.nrows, colrow_prod, len(self.data))

    def as_matrix(self, replace_nodata_val=None):
        """
        Convenience method to give 2D numpy.ndarray representation. If
        replace_nodata_val is given, replace all NODATA_value entries with
        it.

        Arguments:
            replace_nodata_val (float): value with which to replace
                NODATA_value entries

        Returns:
            (numpy.ndarray) matrix representation of the data in the .asc;
            a copy, so ``self.data`` is never modified
        """
        # Reshape the underlying array rather than the Series itself, so the
        # result is a plain ndarray whatever container ``data`` is.
        values = getattr(self.data, 'values', self.data)
        ret = copy.copy(reshape(values, (self.nrows, self.ncols)))
        if replace_nodata_val is not None:
            ret[ret == self.NODATA_value] = replace_nodata_val

        return ret

    def write(self, write_path):
        """Write the grid to *write_path* in ESRI ASCII format.

        NaN cells in ``self.data`` are replaced (and stay replaced on the
        instance) by ``NODATA_value`` before writing.
        """
        # replace nan with NODATA_value
        self.data = self.data.fillna(self.NODATA_value)

        with open(write_path, 'w+') as f:
            f.write("ncols {}\n".format(self.ncols))
            f.write("nrows {}\n".format(self.nrows))
            f.write("xllcorner {}\n".format(self.xllcorner))
            f.write("yllcorner {}\n".format(self.yllcorner))
            f.write("cellsize {}\n".format(self.cellsize))
            f.write("NODATA_value {}\n".format(self.NODATA_value))

            # prob not most efficient, but CASiMiR requires
            # ESRI Ascii w/ newlines
            f.write(
                '\n'.join(
                    ' '.join(str(v) for v in row)
                    for row in self.as_matrix()
                )
            )

    def __eq__(self, other):
        """Compare all header fields and the cell data.

        Returns ``NotImplemented`` for objects that are not ``ESRIAsc``.
        """
        if not isinstance(other, ESRIAsc):
            return NotImplemented

        headers_match = (
            self.ncols == other.ncols
            and self.nrows == other.nrows
            and self.xllcorner == other.xllcorner
            and self.yllcorner == other.yllcorner
            and self.cellsize == other.cellsize
            and self.NODATA_value == other.NODATA_value
        )
        if not headers_match:
            return False

        # Grids without data are equal only to other grids without data.
        if self.data is None or other.data is None:
            return self.data is None and other.data is None

        # An element-wise comparison needs equally long operands.
        if len(self.data) != len(other.data):
            return False

        return bool(all(self.data == other.data))
Пример #58
0
 def test_bfill(self):
     """``bfill()`` must give the same result as ``fillna(method='bfill')``."""
     series = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))
     series[2] = np.NaN
     via_fillna = series.fillna(method='bfill')
     assert_series_equal(series.bfill(), via_fillna)