def test_fillna_consistency(self):
    """Mixing tz-aware or non-datetime values into a tz-naive Series gives object.

    The same expectation is checked through ``fillna``, ``where`` and
    item assignment.
    """
    # GH 16402
    # fillna with a tz aware to a tz-naive, should result in object
    s = Series([Timestamp('20130101'), pd.NaT])
    aware = Timestamp('20130101', tz='US/Eastern')

    result = s.fillna(aware)
    expected = Series([Timestamp('20130101'),
                       Timestamp('2013-01-01', tz='US/Eastern')],
                      dtype='object')
    assert_series_equal(result, expected)

    # where (we ignore the errors=)
    result = s.where([True, False], aware, errors='ignore')
    assert_series_equal(result, expected)

    # with a non-datetime
    result = s.fillna('foo')
    expected = Series([Timestamp('20130101'), 'foo'])
    assert_series_equal(result, expected)

    # assignment
    s2 = s.copy()
    s2[1] = 'foo'
    assert_series_equal(s2, expected)
def test_pad_nan(self):
    """In-place ``fillna(method="pad")`` keeps a leading NaN and pads the rest."""
    labels = ["z", "a", "b", "c", "d"]
    x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], labels, dtype=float)
    x.fillna(method="pad", inplace=True)

    expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], labels, dtype=float)
    # Everything after the leading NaN is compared directly.
    assert_series_equal(x[1:], expected[1:])
    # assertTrue's second positional argument is the failure *message*, so
    # the two NaN checks have to be separate calls for both to be evaluated.
    self.assertTrue(np.isnan(x[0]))
    self.assertTrue(np.isnan(expected[0]))
def test_fillna_bug(self): x = Series([nan, 1., nan, 3., nan],['z','a','b','c','d']) filled = x.fillna(method='ffill') expected = Series([nan, 1., 1., 3., 3.], x.index) assert_series_equal(filled, expected) filled = x.fillna(method='bfill') expected = Series([1., 1., 3., 3., nan], x.index) assert_series_equal(filled, expected)
def test_fillna_bug(self): x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"]) filled = x.fillna(method="ffill") expected = Series([nan, 1.0, 1.0, 3.0, 3.0], x.index) assert_series_equal(filled, expected) filled = x.fillna(method="bfill") expected = Series([1.0, 1.0, 3.0, 3.0, nan], x.index) assert_series_equal(filled, expected)
def test_pad_nan(self):
    """In-place ``fillna(method='pad')`` keeps a leading NaN and pads the rest."""
    labels = ['z', 'a', 'b', 'c', 'd']
    x = Series([np.nan, 1., np.nan, 3., np.nan], labels, dtype=float)
    x.fillna(method='pad', inplace=True)

    expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], labels, dtype=float)
    # Everything after the leading NaN is compared directly.
    assert_series_equal(x[1:], expected[1:])
    # assertTrue(expr, msg): passing the second isnan() as ``msg`` never
    # asserted it, so each NaN check gets its own call.
    self.assertTrue(np.isnan(x[0]))
    self.assertTrue(np.isnan(expected[0]))
def pd_03():
    """Print a small demo of forward-filling and mean-filling missing values."""
    df = DataFrame(np.random.randn(6, 3))
    # Blank out the tail of columns 1 and 2 so there is something to fill.
    # ``.ix`` no longer exists in pandas; on this default integer index
    # ``.iloc`` addresses the same cells.
    df.iloc[2:, 1] = np.nan
    df.iloc[4:, 2] = np.nan
    print(df)
    print(df.fillna(method='ffill'))
    print(df.fillna(method='ffill', limit=2))

    data = Series([1., None, 3.5, None, 7])
    print(data.fillna(data.mean()))
    print(df.fillna(df.mean()))
def test_fillna_raise(self):
    """``fillna`` rejects list/tuple values and invalid ``limit`` arguments."""
    ints = Series(np.random.randint(-100, 100, 50))
    for bad_value in ([1, 2], (1, 2)):
        with pytest.raises(TypeError):
            ints.fillna(bad_value)

    # related GH 9217, make sure limit is an int and greater than 0
    s = Series([1, 2, 3, None])
    methods = ['backfill', 'bfill', 'pad', 'ffill', None]
    combos = [(lim, how) for lim in [-1, 0, 1., 2.] for how in methods]
    for limit, method in combos:
        with pytest.raises(ValueError):
            s.fillna(1, limit=limit, method=method)
def test_fillna(self):
    """Check argument-less, ``backfill`` and scalar ``fillna`` on a time series."""
    ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))

    # nothing is missing yet, so filling must be a no-op
    untouched = ts.fillna()
    self.assert_(np.array_equal(ts, untouched))

    ts[2] = np.NaN

    # (keyword arguments, values expected afterwards)
    cases = [
        ({}, [0., 1., 1., 3., 4.]),
        ({'method': 'backfill'}, [0., 1., 3., 3., 4.]),
        ({'value': 5}, [0., 1., 5., 3., 4.]),
    ]
    for kwargs, wanted in cases:
        self.assert_(np.array_equal(ts.fillna(**kwargs), wanted))
def test_fillna(self):
    """Fill a missing Period with another Period and with a string."""
    # GH 13737
    first = pd.Period('2011-01', freq='M')
    s = Series([first, pd.Period('NaT', freq='M')])

    for filler in (pd.Period('2012-01', freq='M'), 'XXX'):
        res = s.fillna(filler)
        exp = Series([first, filler])
        tm.assert_series_equal(res, exp)
        assert res.dtype == 'object'
def test_fillna_categorical(self, fill_value, expected_output): # GH 17033 # Test fillna for a Categorical series data = ['a', np.nan, 'b', np.nan, np.nan] s = Series(Categorical(data, categories=['a', 'b'])) exp = Series(Categorical(expected_output, categories=['a', 'b'])) tm.assert_series_equal(s.fillna(fill_value), exp)
def test_fill_value_when_combine_const(self): # GH12723 s = Series([0, 1, np.nan, 3, 4, 5]) exp = s.fillna(0).add(2) res = s.add(2, fill_value=0) assert_series_equal(res, exp)
def sliding_freq(dictionary, data_raw, window, key_word):
    '''
    Count and average values over consecutive, non-overlapping windows.

    The number of windows is ``trunc(data_raw.index[-1] / window)``; window
    ``k`` starts at ``k * window`` and is taken as the slice
    ``[start:start + window]`` of each entry's ``key_word`` section.
    Because all windows have the same size, the count acts as a frequency.

    dictionary: mapping of name -> container indexable by ``key_word``.
    data_raw: object whose last index value bounds the windowed range.
    window: window size (the original notes give it in ms).
    key_word: key selecting the section of each dictionary value to window.

    returns ``(sliding_mean, sliding_count)``: two DataFrames indexed by
    window start with one column per dictionary key, columns sorted.
    Windows without any values get a mean of 0 instead of NaN.
    '''
    import math

    # number of whole windows to do the counts and averages over
    num = math.trunc(data_raw.index[-1] / window)
    starts = np.arange(num) * window

    sliding_count = DataFrame(index=starts)
    sliding_mean = DataFrame(index=starts)

    # .items() exists on both Python 2 and 3; iteritems() is Python 2 only
    for key, value in dictionary.items():
        section = value[key_word]
        # Explicit float dtype: the default dtype of a data-less Series
        # differs between pandas versions.
        temp_count = Series(index=starts, dtype=float)
        temp_mean = Series(index=starts, dtype=float)
        for i in starts:
            chunk = section[i:(i + window)]
            temp_count[i] = chunk.count()
            temp_mean[i] = chunk.mean()
        # mean is NaN for windows with no events; make it zero for graphing
        temp_mean = temp_mean.fillna(0)
        sliding_count[key] = temp_count
        sliding_mean[key] = temp_mean

    # put the columns in increasing order
    sliding_count = sliding_count.sort_index(axis=1)
    sliding_mean = sliding_mean.sort_index(axis=1)
    return sliding_mean, sliding_count
def test_endswith(self):
    """Check ``str.endswith`` on string, mixed-type and unicode input."""
    values = Series(["om", NA, "foo_nom", "nom", "bar_foo", NA, "foo"])
    ends_with_foo = Series([False, NA, False, False, True, NA, True])
    tm.assert_series_equal(values.str.endswith("foo"), ends_with_foo)

    # mixed
    mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0]
    xp = [False, NA, False, NA, NA, False, NA, NA, NA]
    tm.assert_almost_equal(strings.str_endswith(mixed, "f"), xp)

    rs = Series(mixed).str.endswith("f")
    self.assert_(isinstance(rs, Series))
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u"om", NA, u"foo_nom", u"nom", u"bar_foo", NA, u"foo"])
    tm.assert_series_equal(values.str.endswith("foo"), ends_with_foo)

    # na=False turns the missing entries into plain False
    without_na = values.str.endswith("foo", na=False)
    tm.assert_series_equal(without_na, ends_with_foo.fillna(False).astype(bool))
def test_endswith(self):
    """Check ``str.endswith`` on string, mixed-type and unicode input."""
    values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
    ends_with_foo = Series([False, NA, False, False, True, NA, True])
    tm.assert_series_equal(values.str.endswith('foo'), ends_with_foo)

    # mixed
    mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
    xp = [False, NA, False, NA, NA, False, NA, NA, NA]
    tm.assert_almost_equal(strings.str_endswith(mixed, 'f'), xp)

    rs = Series(mixed).str.endswith('f')
    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA,
                     u('foo')])
    tm.assert_series_equal(values.str.endswith('foo'), ends_with_foo)

    # na=False turns the missing entries into plain False
    without_na = values.str.endswith('foo', na=False)
    tm.assert_series_equal(without_na, ends_with_foo.fillna(False).astype(bool))
def test_fillna_inplace(self): x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"]) y = x.copy() y.fillna(value=0, inplace=True) expected = x.fillna(value=0) assert_series_equal(y, expected)
def count_enf_born(info_child, index):
    """Sum, per ``id_parent``, the ``nb_enf`` of rows whose ``age_enf`` is >= 0.

    Adds an ``enf_born`` column to ``info_child`` as a side effect.
    Returns a Series starting from zeros over ``index`` to which the
    per-parent totals are added; labels without a total are filled with 0.
    """
    already_born = info_child['age_enf'] >= 0
    info_child['enf_born'] = already_born * info_child['nb_enf']

    per_parent = info_child.groupby(['id_parent'])['enf_born'].sum().reset_index()
    per_parent.columns = ['id_parent', 'nb_born']
    per_parent.index = per_parent['id_parent']

    # start from zero for every requested label, then add the aligned totals
    totals = Series(zeros(len(index)), index=index)
    totals += per_parent['nb_born']
    return totals.fillna(0)
def test_fillna_inplace(self): x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) y = x.copy() y.fillna(value=0, inplace=True) expected = x.fillna(value=0) assert_series_equal(y, expected)
def test_fillna_nat(self): series = Series([0, 1, 2, NaT], dtype='M8[ns]') filled = series.fillna(method='pad') filled2 = series.fillna(value=series.values[2]) expected = series.copy() expected.values[3] = expected.values[2] assert_series_equal(filled, expected) assert_series_equal(filled2, expected) df = DataFrame({'A': series}) filled = df.fillna(method='pad') filled2 = df.fillna(value=series.values[2]) expected = DataFrame({'A': expected}) assert_frame_equal(filled, expected) assert_frame_equal(filled2, expected)
def test_fillna_nat(self): series = Series([0, 1, 2, NaT], dtype="M8[us]") filled = series.fillna(method="pad") filled2 = series.fillna(value=series[2]) expected = series.copy() expected[3] = expected[2] assert_series_equal(filled, expected) assert_series_equal(filled2, expected) df = DataFrame({"A": series}) filled = df.fillna(method="pad") filled2 = df.fillna(value=series[2]) expected = DataFrame({"A": expected}) assert_frame_equal(filled, expected) assert_frame_equal(filled2, expected)
def test_datetime64_fillna(self):
    """Fill a datetime Series by value, with NaT, and by ffill/bfill."""
    jan1 = Timestamp("20130101")
    jan3 = Timestamp("20130103 9:01:01")
    s = Series([jan1, jan1, Timestamp("20130102"), jan3])
    s[2] = np.nan

    # reg fillna
    result = s.fillna(Timestamp("20130104"))
    assert_series_equal(result, Series([jan1, jan1, Timestamp("20130104"), jan3]))

    from pandas import tslib

    # filling with NaT changes nothing
    result = s.fillna(tslib.NaT)
    assert_series_equal(result, s)

    # ffill
    result = s.ffill()
    assert_series_equal(result, Series([jan1, jan1, jan1, jan3]))

    # bfill
    result = s.bfill()
    assert_series_equal(result, Series([jan1, jan1, jan3, jan3]))

    # GH 6587
    # make sure that we are treating as integer when filling
    # this also tests inference of a datetime-like with NaT's
    stamp = "2013-08-05 15:30:00.000001"
    s = Series([pd.NaT, pd.NaT, "2013-08-05 15:30:00.000001"])
    expected = Series([stamp, stamp, stamp], dtype="M8[ns]")
    result = s.fillna(method="backfill")
    assert_series_equal(result, expected)
def test_bool_operators_with_nas(self, bool_op): # boolean &, |, ^ should work with object arrays and propagate NAs ser = Series(bdate_range('1/1/2000', periods=10), dtype=object) ser[::2] = np.nan mask = ser.isna() filled = ser.fillna(ser[0]) result = bool_op(ser < ser[9], ser > ser[3]) expected = bool_op(filled < filled[9], filled > filled[3]) expected[mask] = False assert_series_equal(result, expected)
def test_datetime64_fillna(self):
    """Fill a datetime Series by value, with NaT, and by ffill/bfill."""
    jan1 = Timestamp('20130101')
    jan3 = Timestamp('20130103 9:01:01')
    s = Series([jan1, jan1, Timestamp('20130102'), jan3])
    s[2] = np.nan

    # reg fillna
    result = s.fillna(Timestamp('20130104'))
    assert_series_equal(result, Series([jan1, jan1, Timestamp('20130104'), jan3]))

    # filling with NaT changes nothing
    result = s.fillna(NaT)
    assert_series_equal(result, s)

    # ffill
    result = s.ffill()
    assert_series_equal(result, Series([jan1, jan1, jan1, jan3]))

    # bfill
    result = s.bfill()
    assert_series_equal(result, Series([jan1, jan1, jan3, jan3]))

    # GH 6587
    # make sure that we are treating as integer when filling
    # this also tests inference of a datetime-like with NaT's
    stamp = '2013-08-05 15:30:00.000001'
    s = Series([pd.NaT, pd.NaT, '2013-08-05 15:30:00.000001'])
    expected = Series([stamp, stamp, stamp], dtype='M8[ns]')
    result = s.fillna(method='backfill')
    assert_series_equal(result, expected)
def main():
    """
    Handling of not applicable values

    Prints a walkthrough of detecting, dropping and filling missing values
    in Series and DataFrame objects.
    """
    string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
    print(string_data)
    print(string_data.isnull())
    string_data[0] = None
    print(string_data.isnull())
    print(None is np.nan, None == np.nan)  # not same

    # Exclude N/A
    print('', '')
    NA = np.nan
    data = Series([1, NA, 3.5, NA, 7])
    print(data.dropna())
    print(data[data.notnull()])

    data = DataFrame([
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.]
    ])
    cleaned = data.dropna()  # keeps only rows without any NA
    print(data)
    print(cleaned)
    print(data.dropna(how='all'))
    data[4] = None
    print(data.dropna(axis=1, how='all'))
    print(data.dropna(thresh=2))  # keeps rows with at least 2 non-NA values

    # Fill NA
    print('', '')
    print(data.fillna(0))
    print(data.fillna({1: 0.5, 2: -1}))
    _ = data.fillna(0, inplace=True)
    print(data)

    print('', '')
    df = DataFrame(np.arange(18).reshape((6, 3)))
    # ``.ix`` no longer exists in pandas; on this default integer index
    # ``.iloc`` addresses the same cells.
    df.iloc[2:, 1] = NA
    df.iloc[4:, 2] = NA
    print(df)
    print(df.fillna(method='ffill'))
    print(df.fillna(method='ffill', limit=2))
    data = Series([1., NA, 3.5, NA, 7])
    print(data.fillna(data.mean()))
def test_comparison_operators_with_nas(self): s = Series(bdate_range('1/1/2000', periods=10), dtype=object) s[::2] = np.nan # test that comparisons work ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne'] for op in ops: val = s[5] f = getattr(operator, op) result = f(s, val) expected = f(s.dropna(), val).reindex(s.index) if op == 'ne': expected = expected.fillna(True).astype(bool) else: expected = expected.fillna(False).astype(bool) assert_series_equal(result, expected) # fffffffuuuuuuuuuuuu # result = f(val, s) # expected = f(val, s.dropna()).reindex(s.index) # assert_series_equal(result, expected) # boolean &, |, ^ should work with object arrays and propagate NAs ops = ['and_', 'or_', 'xor'] mask = s.isnull() for bool_op in ops: f = getattr(operator, bool_op) filled = s.fillna(s[0]) result = f(s < s[9], s > s[3]) expected = f(filled < filled[9], filled > filled[3]) expected[mask] = False assert_series_equal(result, expected)
def test_timedelta_assignment():
    """Assigning ``timedelta`` values through ``.loc`` stores Timedelta values."""
    # GH 8209
    one_day = Timedelta("1 days")

    s = Series([], dtype=object)
    s.loc["B"] = timedelta(1)
    tm.assert_series_equal(s, Series(one_day, index=["B"]))

    # prepend a new, empty label
    s = s.reindex(s.index.insert(0, "A"))
    tm.assert_series_equal(s, Series([np.nan, one_day], index=["A", "B"]))

    expected = Series(one_day, index=["A", "B"])
    result = s.fillna(timedelta(1))
    tm.assert_series_equal(result, expected)

    s.loc["A"] = timedelta(1)
    tm.assert_series_equal(s, expected)

    # GH 14155
    ten_min = np.timedelta64(10, "m")
    twenty_min = np.timedelta64(20, "m")
    s = Series(10 * [ten_min])
    s.loc[[1, 2, 3]] = twenty_min
    expected = pd.Series(10 * [ten_min])
    expected.loc[[1, 2, 3]] = pd.Timedelta(twenty_min)
    tm.assert_series_equal(s, expected)
def test_timedelta_assignment():
    """Assigning ``timedelta`` values through ``.loc`` stores Timedelta values."""
    # GH 8209
    one_day = Timedelta('1 days')

    s = Series([])
    s.loc['B'] = timedelta(1)
    tm.assert_series_equal(s, Series(one_day, index=['B']))

    # prepend a new, empty label
    s = s.reindex(s.index.insert(0, 'A'))
    with_gap = Series([np.nan, one_day], index=['A', 'B'])
    tm.assert_series_equal(s, with_gap)

    expected = Series(one_day, index=['A', 'B'])
    result = s.fillna(timedelta(1))
    tm.assert_series_equal(result, expected)

    s.loc['A'] = timedelta(1)
    tm.assert_series_equal(s, expected)

    # GH 14155
    ten_min = np.timedelta64(10, 'm')
    twenty_min = np.timedelta64(20, 'm')
    s = Series(10 * [ten_min])
    s.loc[[1, 2, 3]] = twenty_min
    expected = pd.Series(10 * [ten_min])
    expected.loc[[1, 2, 3]] = pd.Timedelta(twenty_min)
    tm.assert_series_equal(s, expected)
def test_timedelta_assignment():
    """``.loc`` assignment and ``fillna`` with ``timedelta`` give Timedelta values."""
    # GH 8209
    day = Timedelta('1 days')
    both_labels = ['A', 'B']

    s = Series([])
    s.loc['B'] = timedelta(1)
    tm.assert_series_equal(s, Series(day, index=['B']))

    # insert label 'A' in front; it starts out missing
    s = s.reindex(s.index.insert(0, 'A'))
    tm.assert_series_equal(s, Series([np.nan, day], index=both_labels))

    result = s.fillna(timedelta(1))
    expected = Series(day, index=both_labels)
    tm.assert_series_equal(result, expected)

    s.loc['A'] = timedelta(1)
    tm.assert_series_equal(s, expected)

    # GH 14155
    base = 10 * [np.timedelta64(10, 'm')]
    replacement = np.timedelta64(20, 'm')
    s = Series(base)
    s.loc[[1, 2, 3]] = replacement
    expected = pd.Series(base)
    expected.loc[[1, 2, 3]] = pd.Timedelta(replacement)
    tm.assert_series_equal(s, expected)
def _normalize_names(self, names: pd.Series) -> pd.Series: """Take names and run a normalization routine""" # Make a transalation table of unwanted characers unwanted_characters = ( string.digits + string.punctuation + string.whitespace ) # Remove unwanted characters efficiently translation_table = str.maketrans('', '', unwanted_characters) # Run our string operations (remember NAN is a valid name) output = ( names.fillna('') .astype(str) .str.translate(translation_table) .str.upper() .str.replace(r'\s?J\.*?R\.*\s*?$', '', regex=True) .str.replace(r'\s?S\.*?R\.*\s*?$', '', regex=True) .str.replace(r'\s?III\s*?$', '', regex=True) .str.replace(r'\s?IV\s*?$', '', regex=True) ) output.name = 'name' return output
def test_fillna_float_casting(self, dtype, fill_type, scalar): # GH-43424 ser = Series([np.nan, 1.2], dtype=dtype) fill_values = Series([2, 2], dtype=fill_type) if scalar: fill_values = fill_values.dtype.type(2) result = ser.fillna(fill_values) expected = Series([2.0, 1.2], dtype=dtype) tm.assert_series_equal(result, expected) ser = Series([np.nan, 1.2], dtype=dtype) mask = ser.isna().to_numpy() ser[mask] = fill_values tm.assert_series_equal(ser, expected) ser = Series([np.nan, 1.2], dtype=dtype) ser.mask(mask, fill_values, inplace=True) tm.assert_series_equal(ser, expected) ser = Series([np.nan, 1.2], dtype=dtype) res = ser.where(~mask, fill_values) tm.assert_series_equal(res, expected)
def binned_sampling(values: pd.Series, feature_list: List[str], ctrl_size: int, n_bins: int,
                    rand_seed: int) -> List[str]:
    """
    Sample control features from the same value bins as the given features.

    ``values`` (missing entries treated as 0) are ranked and cut into bins of
    roughly ``len(values) / (n_bins - 1)`` items each. For every bin holding
    at least one feature of ``feature_list``, up to ``ctrl_size`` members of
    that bin are drawn at random. The features themselves are excluded from
    the result.

    The original notes describe this as adapted from Scanpy's `score_genes`,
    which follows the approach in Seurat [Satija15]_.

    Args:
        values: The values for the features, indexed by feature name.
        feature_list: The list of features whose bins are sampled from.
        ctrl_size: Maximum number of reference features sampled from each bin.
        n_bins: Number of bins for sampling.
        rand_seed: The seed to use for the random number generation.

    Returns:
        A list of sampled features (in no particular order).
    """
    n_items = int(np.round(len(values) / (n_bins - 1)))
    feature_set = set(feature_list)

    # Bin id per feature: rank // n_items
    obs_cut: pd.Series = values.fillna(0).rank(
        method='min').divide(n_items).astype(int)

    control_genes = set()
    # pandas rejects a set as an indexer, so index with a list
    for cut in np.unique(obs_cut[list(feature_set)]):
        members = obs_cut[obs_cut == cut]
        # sample() raises when asked for more rows than the bin holds,
        # so cap the request at the bin size
        n_take = min(ctrl_size, len(members))
        r_genes = members.sample(n=n_take, random_state=rand_seed).index
        control_genes.update(r_genes)

    return list(control_genes - feature_set)
def _prepare_data_dtypes(self, series: pd.Series) -> pd.Series: """ Подготовка данных для передачи данных на вход encoder'a: - замена пропусков на fill_value; - преобразованеи столбца значений в object-столбец. Parameters: ----------- series: pandas.Series Вектор наблюдений. Returns: -------- series_prepared: pandas.Series Преобразованный вектор наблюдений. """ try: if series.dtype == 'category': series = series.cat.add_categories(self.fill_value) except TypeError: pass series_prepared = series.fillna(self.fill_value) series_prepared = series_prepared.astype("str") return series_prepared
def test_fillna_raise(self):
    """``fillna`` rejects list/tuple values and invalid ``limit`` arguments."""
    s = Series(np.random.randint(-100, 100, 50))

    # (bad value, message expected in the TypeError)
    bad_values = [
        ([1, 2],
         '"value" parameter must be a scalar or dict, but you passed a "list"'),
        ((1, 2),
         '"value" parameter must be a scalar or dict, but you passed a "tuple"'),
    ]
    for bad_value, msg in bad_values:
        with pytest.raises(TypeError, match=msg):
            s.fillna(bad_value)

    # related GH 9217, make sure limit is an int and greater than 0
    s = Series([1, 2, 3, None])
    msg = (r"Cannot specify both 'value' and 'method'\.|"
           r"Limit must be greater than 0|"
           "Limit must be an integer")
    methods = ["backfill", "bfill", "pad", "ffill", None]
    combos = [(lim, how) for lim in [-1, 0, 1.0, 2.0] for how in methods]
    for limit, method in combos:
        with pytest.raises(ValueError, match=msg):
            s.fillna(1, limit=limit, method=method)
def test_fillna(self):
    """``fillna`` by method and by value, plus the error when neither is given."""
    ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5))

    # nothing is missing yet, so forward-filling is a no-op
    tm.assert_series_equal(ts, ts.fillna(method="ffill"))

    ts[2] = np.NaN

    # (keyword arguments, values expected afterwards)
    cases = [
        ({"method": "ffill"}, [0.0, 1.0, 1.0, 3.0, 4.0]),
        ({"method": "backfill"}, [0.0, 1.0, 3.0, 3.0, 4.0]),
        ({"value": 5}, [0.0, 1.0, 5.0, 3.0, 4.0]),
    ]
    for kwargs, wanted in cases:
        exp = Series(wanted, index=ts.index)
        tm.assert_series_equal(ts.fillna(**kwargs), exp)

    msg = "Must specify a fill 'value' or 'method'"
    with pytest.raises(ValueError, match=msg):
        ts.fillna()
def test_fillna(self):
    """``fillna`` with methods, scalars, dict/Series values, ``limit`` and numeric strings."""
    ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))

    # nothing is missing yet, so forward-filling is a no-op
    self.assert_numpy_array_equal(ts, ts.fillna(method='ffill'))

    ts[2] = np.NaN

    # (keyword arguments, values expected afterwards)
    cases = [
        (dict(method='ffill'), [0., 1., 1., 3., 4.]),
        (dict(method='backfill'), [0., 1., 3., 3., 4.]),
        (dict(value=5), [0., 1., 5., 3., 4.]),
    ]
    for kwargs, wanted in cases:
        self.assert_numpy_array_equal(ts.fillna(**kwargs), wanted)

    # neither argument, or both value and method, is an error
    self.assertRaises(ValueError, ts.fillna)
    self.assertRaises(ValueError, self.ts.fillna, value=0, method='ffill')

    # GH 5703
    s1 = Series([np.nan])
    s2 = Series([1])
    expected = Series([1.])

    assert_series_equal(s1.fillna(s2), expected)
    assert_series_equal(s1.fillna({}), s1)
    assert_series_equal(s1.fillna(Series(())), s1)
    assert_series_equal(s2.fillna(s1), s2)
    assert_series_equal(s1.fillna({0: 1}), expected)
    assert_series_equal(s1.fillna({1: 1}), Series([np.nan]))
    assert_series_equal(s1.fillna({0: 1, 1: 1}), expected)
    assert_series_equal(s1.fillna(Series({0: 1, 1: 1})), expected)
    assert_series_equal(s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])), s1)

    # fill values are matched by label, not by position
    s1 = Series([0, 1, 2], list('abc'))
    s2 = Series([0, np.nan, 2], list('bac'))
    assert_series_equal(s2.fillna(s1), Series([0, 0, 2.], list('bac')))

    # limit
    s = Series(np.nan, index=[0, 1, 2])
    limit_cases = [
        (1, [999, np.nan, np.nan]),
        (2, [999, 999, np.nan]),
    ]
    for limit, wanted in limit_cases:
        result = s.fillna(999, limit=limit)
        assert_series_equal(result, Series(wanted, index=[0, 1, 2]))

    # GH 9043
    # make sure a string representation of int/float values can be filled
    # correctly without raising errors or being converted
    for val in ['0', '1.5', '-0.3']:
        s = Series([0, 1, np.nan, np.nan, 4], dtype='float64')
        result = s.fillna(val)
        expected = Series([0, 1, val, val, 4], dtype='object')
        assert_series_equal(result, expected)
def test_bfill(self):
    """``bfill()`` is equivalent to ``fillna(method='bfill')``."""
    ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))
    ts[2] = np.NaN

    shortcut = ts.bfill()
    explicit = ts.fillna(method='bfill')
    assert_series_equal(shortcut, explicit)
def test_fillna_int(self): s = Series(np.random.randint(-100, 100, 50)) s.fillna(method='ffill', inplace=True) assert_series_equal(s.fillna(method='ffill', inplace=False), s)
def ebsw(close, length=None, bars=None, offset=None, **kwargs):
    """Indicator: Even Better SineWave (EBSW)

    Args:
        close: Series of prices; passed through ``verify_series``.
        length: High-pass period. Values that are missing or not greater
            than 38 are replaced by 40.
        bars: Smoothing period. Values that are missing or not positive are
            replaced by 10.
        offset: Number of periods to shift the result by.

    Kwargs:
        fillna: Value handed to ``Series.fillna``.
        fill_method: Method handed to ``Series.fillna(method=...)``.

    Returns:
        A Series named ``EBSW_{length}_{bars}`` with category ``cycles``, or
        None when ``verify_series`` returns None.
    """
    # Validate arguments
    length = int(length) if length and length > 38 else 40
    bars = int(bars) if bars and bars > 0 else 10
    close = verify_series(close, length)
    offset = get_offset(offset)

    if close is None:
        return

    # Filter coefficients depend only on ``length`` and ``bars``, so they are
    # computed once instead of on every bar.
    # NOTE(review): 360 / length and 180 / bars look like degree values being
    # passed to trig functions -- confirm the intended angle unit.
    # HighPass filter cyclic components whose periods are shorter than Duration input
    alpha1 = (1 - npSin(360 / length)) / npCos(360 / length)
    # Super Smoother Filter coefficients from equation 3-3
    a1 = npExp(-npSqrt(2) * npPi / bars)
    b1 = 2 * a1 * npCos(npSqrt(2) * 180 / bars)
    c2 = b1
    c3 = -1 * a1 * a1
    c1 = 1 - c2 - c3

    last_close = last_hp = 0
    # Two most recent filter outputs: filt_prev1 is the newer one
    filt_prev2 = filt_prev1 = 0

    # Calculate Result
    m = close.size
    result = [npNaN] * (length - 1) + [0]
    for i in range(length, m):
        hp = 0.5 * (1 + alpha1) * (close[i] - last_close) + alpha1 * last_hp

        # Smooth with the Super Smoother Filter
        filt = c1 * (hp + last_hp) / 2 + c2 * filt_prev1 + c3 * filt_prev2

        # 3 Bar average of Wave amplitude and power
        wave = (filt + filt_prev1 + filt_prev2) / 3
        pwr = (filt * filt + filt_prev1 * filt_prev1 + filt_prev2 * filt_prev2) / 3

        # Normalize the Average Wave to Square Root of the Average Power
        wave = wave / npSqrt(pwr)

        # update storage, result
        filt_prev2, filt_prev1 = filt_prev1, filt
        last_hp = hp
        last_close = close[i]
        result.append(wave)

    wave_series = Series(result, index=close.index)

    # Offset
    if offset != 0:
        wave_series = wave_series.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        wave_series.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        wave_series.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    wave_series.name = f"EBSW_{length}_{bars}"
    wave_series.category = "cycles"

    return wave_series
# Filling missing values: a scalar, per-column values, then in place
df.fillna(0)
df.fillna({1: 0.5, 3: -1})
df.fillna({1: 0.5, 2: -1})
df.fillna(0, inplace=True)
df

# Fresh frame with the tail of columns 1 and 2 blanked out
df = DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
df.fillna(method='ffill')
df.fillna(method='ffill', limit=2)

# Fill a Series with its own mean
data = Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

### Hierarchical Indexing -- adding higher dimensionality -- no examples

### read_csv et al
df = pd.read_csv('ex1.csv')
df
pd.read_table('ex1.csv', sep=',')
pd.read_csv('ex2.csv', header=None)
pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ex2.csv', names=names, index_col='message')
# raw string: '\s' is not a valid escape sequence in a normal string literal
result = pd.read_table('ex3.txt', sep=r'\s+')
def test_fillna_int(self): ser = Series(np.random.randint(-100, 100, 50)) return_value = ser.fillna(method="ffill", inplace=True) assert return_value is None tm.assert_series_equal(ser.fillna(method="ffill", inplace=False), ser)
def test_bfill(self):
    """``bfill()`` is equivalent to ``fillna(method="bfill")``."""
    ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5))
    ts[2] = np.NaN

    shortcut = ts.bfill()
    explicit = ts.fillna(method="bfill")
    tm.assert_series_equal(shortcut, explicit)
def test_fillna(self, datetime_series):
    """``fillna`` with methods, scalars, dict/Series values, ``limit`` and numeric strings."""
    ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5))

    # nothing is missing yet, so forward-filling is a no-op
    tm.assert_series_equal(ts, ts.fillna(method="ffill"))

    ts[2] = np.NaN

    # (keyword arguments, values expected afterwards)
    cases = [
        ({"method": "ffill"}, [0.0, 1.0, 1.0, 3.0, 4.0]),
        ({"method": "backfill"}, [0.0, 1.0, 3.0, 3.0, 4.0]),
        ({"value": 5}, [0.0, 1.0, 5.0, 3.0, 4.0]),
    ]
    for kwargs, wanted in cases:
        exp = Series(wanted, index=ts.index)
        tm.assert_series_equal(ts.fillna(**kwargs), exp)

    msg = "Must specify a fill 'value' or 'method'"
    with pytest.raises(ValueError, match=msg):
        ts.fillna()

    msg = "Cannot specify both 'value' and 'method'"
    with pytest.raises(ValueError, match=msg):
        datetime_series.fillna(value=0, method="ffill")

    # GH#5703
    s1 = Series([np.nan])
    s2 = Series([1])
    expected = Series([1.0])

    tm.assert_series_equal(s1.fillna(s2), expected)
    tm.assert_series_equal(s1.fillna({}), s1)
    tm.assert_series_equal(s1.fillna(Series((), dtype=object)), s1)
    tm.assert_series_equal(s2.fillna(s1), s2)
    tm.assert_series_equal(s1.fillna({0: 1}), expected)
    tm.assert_series_equal(s1.fillna({1: 1}), Series([np.nan]))
    tm.assert_series_equal(s1.fillna({0: 1, 1: 1}), expected)
    tm.assert_series_equal(s1.fillna(Series({0: 1, 1: 1})), expected)
    tm.assert_series_equal(s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])), s1)

    # fill values are matched by label, not by position
    s1 = Series([0, 1, 2], list("abc"))
    s2 = Series([0, np.nan, 2], list("bac"))
    tm.assert_series_equal(s2.fillna(s1), Series([0, 0, 2.0], list("bac")))

    # limit
    ser = Series(np.nan, index=[0, 1, 2])
    limit_cases = [
        (1, [999, np.nan, np.nan]),
        (2, [999, 999, np.nan]),
    ]
    for limit, wanted in limit_cases:
        result = ser.fillna(999, limit=limit)
        tm.assert_series_equal(result, Series(wanted, index=[0, 1, 2]))

    # GH#9043
    # make sure a string representation of int/float values can be filled
    # correctly without raising errors or being converted
    for val in ["0", "1.5", "-0.3"]:
        ser = Series([0, 1, np.nan, np.nan, 4], dtype="float64")
        result = ser.fillna(val)
        expected = Series([0, 1, val, val, 4], dtype="object")
        tm.assert_series_equal(result, expected)
def test_datetime64_tz_fillna(self, tz):
    """Check ``Series.fillna`` on naive and tz-aware datetime64 Series.

    ``tz`` is supplied by the caller (presumably a parametrized fixture --
    confirm against the test module).  Each case fills the two NaT slots
    and additionally asserts, via ``null_loc``, that the source Series was
    not modified by the non-inplace call.
    """
    # DatetimeLikeBlock
    # tz-naive Series with NaT at positions 1 and 3
    ser = Series([
        Timestamp("2011-01-01 10:00"),
        NaT,
        Timestamp("2011-01-03 10:00"),
        NaT,
    ])
    # expected isna() mask of the untouched source Series
    null_loc = Series([False, True, False, True])

    # naive Series filled with a naive Timestamp
    result = ser.fillna(Timestamp("2011-01-02 10:00"))
    expected = Series([
        Timestamp("2011-01-01 10:00"),
        Timestamp("2011-01-02 10:00"),
        Timestamp("2011-01-03 10:00"),
        Timestamp("2011-01-02 10:00"),
    ])
    tm.assert_series_equal(expected, result)
    # check s is not changed
    tm.assert_series_equal(isna(ser), null_loc)

    # naive Series filled with a tz-aware Timestamp: expected mixes naive and aware
    result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz))
    expected = Series([
        Timestamp("2011-01-01 10:00"),
        Timestamp("2011-01-02 10:00", tz=tz),
        Timestamp("2011-01-03 10:00"),
        Timestamp("2011-01-02 10:00", tz=tz),
    ])
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # naive Series filled with a string: expected is object dtype
    result = ser.fillna("AAA")
    expected = Series(
        [
            Timestamp("2011-01-01 10:00"),
            "AAA",
            Timestamp("2011-01-03 10:00"),
            "AAA",
        ],
        dtype=object,
    )
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # per-position fill values given as a dict, one aware and one naive
    result = ser.fillna({
        1: Timestamp("2011-01-02 10:00", tz=tz),
        3: Timestamp("2011-01-04 10:00"),
    })
    expected = Series([
        Timestamp("2011-01-01 10:00"),
        Timestamp("2011-01-02 10:00", tz=tz),
        Timestamp("2011-01-03 10:00"),
        Timestamp("2011-01-04 10:00"),
    ])
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # dict of naive fill values only
    result = ser.fillna({
        1: Timestamp("2011-01-02 10:00"),
        3: Timestamp("2011-01-04 10:00")
    })
    expected = Series([
        Timestamp("2011-01-01 10:00"),
        Timestamp("2011-01-02 10:00"),
        Timestamp("2011-01-03 10:00"),
        Timestamp("2011-01-04 10:00"),
    ])
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # DatetimeTZBlock
    # same layout as above, but the Series is tz-aware in ``tz``
    idx = DatetimeIndex(["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz)
    ser = Series(idx)
    assert ser.dtype == f"datetime64[ns, {tz}]"
    tm.assert_series_equal(isna(ser), null_loc)

    # aware Series filled with a naive Timestamp: expected mixes aware and naive
    result = ser.fillna(Timestamp("2011-01-02 10:00"))
    expected = Series([
        Timestamp("2011-01-01 10:00", tz=tz),
        Timestamp("2011-01-02 10:00"),
        Timestamp("2011-01-03 10:00", tz=tz),
        Timestamp("2011-01-02 10:00"),
    ])
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # aware Series filled with a Timestamp in the same tz: expected built from a tz-aware index
    result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz))
    idx = DatetimeIndex(
        [
            "2011-01-01 10:00",
            "2011-01-02 10:00",
            "2011-01-03 10:00",
            "2011-01-02 10:00",
        ],
        tz=tz,
    )
    expected = Series(idx)
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # same fill value, passed as a stdlib datetime instead of a Timestamp
    result = ser.fillna(
        Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime())
    idx = DatetimeIndex(
        [
            "2011-01-01 10:00",
            "2011-01-02 10:00",
            "2011-01-03 10:00",
            "2011-01-02 10:00",
        ],
        tz=tz,
    )
    expected = Series(idx)
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # aware Series filled with a string: expected is object dtype
    result = ser.fillna("AAA")
    expected = Series(
        [
            Timestamp("2011-01-01 10:00", tz=tz),
            "AAA",
            Timestamp("2011-01-03 10:00", tz=tz),
            "AAA",
        ],
        dtype=object,
    )
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # dict fill with one aware and one naive value
    result = ser.fillna({
        1: Timestamp("2011-01-02 10:00", tz=tz),
        3: Timestamp("2011-01-04 10:00"),
    })
    expected = Series([
        Timestamp("2011-01-01 10:00", tz=tz),
        Timestamp("2011-01-02 10:00", tz=tz),
        Timestamp("2011-01-03 10:00", tz=tz),
        Timestamp("2011-01-04 10:00"),
    ])
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # dict fill where every value carries the Series' own tz
    result = ser.fillna({
        1: Timestamp("2011-01-02 10:00", tz=tz),
        3: Timestamp("2011-01-04 10:00", tz=tz),
    })
    expected = Series([
        Timestamp("2011-01-01 10:00", tz=tz),
        Timestamp("2011-01-02 10:00", tz=tz),
        Timestamp("2011-01-03 10:00", tz=tz),
        Timestamp("2011-01-04 10:00", tz=tz),
    ])
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # filling with a naive/other zone, coerce to object
    result = ser.fillna(Timestamp("20130101"))
    expected = Series([
        Timestamp("2011-01-01 10:00", tz=tz),
        Timestamp("2013-01-01"),
        Timestamp("2011-01-03 10:00", tz=tz),
        Timestamp("2013-01-01"),
    ])
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)

    # fill value in a fixed zone (US/Pacific) regardless of ``tz``
    result = ser.fillna(Timestamp("20130101", tz="US/Pacific"))
    expected = Series([
        Timestamp("2011-01-01 10:00", tz=tz),
        Timestamp("2013-01-01", tz="US/Pacific"),
        Timestamp("2011-01-03 10:00", tz=tz),
        Timestamp("2013-01-01", tz="US/Pacific"),
    ])
    tm.assert_series_equal(expected, result)
    tm.assert_series_equal(isna(ser), null_loc)
# NOTE(review): s1 is used below before it is assigned in this fragment;
# presumably it was defined earlier in the original script -- confirm.
s2 = Series(arange(1.0,4.0),index=["c","d","e"])
# Adding two Series aligns them on index labels.
s3 = s1 + s2
# Bare expression: only displays a value in an interactive session.
s3
# dropna() returns a new Series without the null entries; s3 itself is not reassigned.
s3.dropna()

##############################################################################
#### fillna
## fillna(value) replaces every null value in a Series with the given value.
s1 = Series(arange(1.0,4.0),index=["a","b","c"])
s2 = Series(arange(1.0,4.0),index=["c","d","e"])
# Only label "c" is present in both operands.
s3 = s1 + s2
s3.fillna(1.0)

################
## append
## append(series) appends one Series to another, similar to list.append.

################
## replace
## replace(list, values) replaces a set of values in a Series with new values.
## replace is similar to fillna, except that it also replaces non-null values.

################
## update
def count_words_in_series(series: pd.Series):
    """Count the words longer than three characters found in *series*.

    Missing entries are treated as empty strings, all entries are joined
    with single spaces, the joined text is passed through
    ``CounterWrapper.prepare_string`` and the result is split on single
    spaces.  Returns a ``Counter`` mapping each kept word to its frequency.
    """
    joined_text = " ".join(series.fillna("").values.tolist())
    prepared_text = CounterWrapper.prepare_string(joined_text)
    # words of three characters or fewer are dropped
    return Counter(word for word in prepared_text.split(" ") if len(word) > 3)
class MySeries:
    """Thin wrapper around a pandas ``Series`` whose methods return ``MySeries``.

    The wrapped Series is stored in ``x``; ``values`` and ``index`` are
    captured from it at construction time.  Every method forwards its
    arguments to the corresponding pandas operation and wraps the result
    in a new ``MySeries``.

    The ``rolling_*`` methods historically called the module-level
    ``pd.rolling_*`` functions.  Those were removed from pandas, and
    ``pd.rolling_kurtosis`` never existed (the legacy name was
    ``pd.rolling_kurt``), so these methods now use the legacy function when
    the installed pandas still provides it and otherwise go through
    ``Series.rolling(*args, **kwargs).<aggregation>()``.

    NOTE(review): ``nonzero``, ``clip_lower`` and ``clip_upper`` forward to
    Series methods that recent pandas releases no longer provide, and
    ``resample`` wraps whatever ``Series.resample`` returns -- confirm these
    against the pandas version in use.
    """

    def __init__(self, *args, **kwargs):
        """Build the wrapped Series from the same arguments ``Series`` accepts."""
        self.x = Series(*args, **kwargs)
        self.values = self.x.values
        self.index = self.x.index

    def _rolling(self, aggregation, legacy_name, args, kwargs):
        """Apply a rolling aggregation, preferring the legacy top-level function."""
        legacy_func = getattr(pd, legacy_name, None)
        if legacy_func is not None:
            # old pandas: keep the exact historical call
            return MySeries(legacy_func(self.x, *args, **kwargs))
        window = self.x.rolling(*args, **kwargs)
        return MySeries(getattr(window, aggregation)())

    def rolling_mean(self, *args, **kwargs):
        return self._rolling("mean", "rolling_mean", args, kwargs)

    def rolling_count(self, *args, **kwargs):
        if getattr(pd, "rolling_count", None) is None and len(args) < 2:
            # the legacy function counted non-null values in partial windows
            # too; min_periods=0 asks Series.rolling for the same behaviour
            kwargs.setdefault("min_periods", 0)
        return self._rolling("count", "rolling_count", args, kwargs)

    def rolling_sum(self, *args, **kwargs):
        return self._rolling("sum", "rolling_sum", args, kwargs)

    def rolling_median(self, *args, **kwargs):
        return self._rolling("median", "rolling_median", args, kwargs)

    def rolling_min(self, *args, **kwargs):
        return self._rolling("min", "rolling_min", args, kwargs)

    def rolling_max(self, *args, **kwargs):
        return self._rolling("max", "rolling_max", args, kwargs)

    def rolling_std(self, *args, **kwargs):
        return self._rolling("std", "rolling_std", args, kwargs)

    def rolling_var(self, *args, **kwargs):
        return self._rolling("var", "rolling_var", args, kwargs)

    def rolling_skew(self, *args, **kwargs):
        return self._rolling("skew", "rolling_skew", args, kwargs)

    def rolling_kurtosis(self, *args, **kwargs):
        # the legacy pandas function was named rolling_kurt
        return self._rolling("kurt", "rolling_kurt", args, kwargs)

    def rolling_window(self, *args, **kwargs):
        # Without the legacy function this computes a (optionally weighted)
        # rolling mean; pass win_type as a keyword in that case.
        return self._rolling("mean", "rolling_window", args, kwargs)

    def cumprod(self, *args, **kwargs):
        return MySeries(self.x.cumprod(*args, **kwargs))

    def cumsum(self, *args, **kwargs):
        return MySeries(self.x.cumsum(*args, **kwargs))

    def diff(self, *args, **kwargs):
        return MySeries(self.x.diff(*args, **kwargs))

    def div(self, *args, **kwargs):
        return MySeries(self.x.div(*args, **kwargs))

    def mul(self, *args, **kwargs):
        return MySeries(self.x.mul(*args, **kwargs))

    def add(self, *args, **kwargs):
        return MySeries(self.x.add(*args, **kwargs))

    def dropna(self, *args, **kwargs):
        return MySeries(self.x.dropna(*args, **kwargs))

    def fillna(self, *args, **kwargs):
        return MySeries(self.x.fillna(*args, **kwargs))

    def floordiv(self, *args, **kwargs):
        return MySeries(self.x.floordiv(*args, **kwargs))

    def mod(self, *args, **kwargs):
        return MySeries(self.x.mod(*args, **kwargs))

    def nlargest(self, *args, **kwargs):
        return MySeries(self.x.nlargest(*args, **kwargs))

    def nonzero(self, *args, **kwargs):
        return MySeries(self.x.nonzero(*args, **kwargs))

    def nsmallest(self, *args, **kwargs):
        return MySeries(self.x.nsmallest(*args, **kwargs))

    def pow(self, *args, **kwargs):
        return MySeries(self.x.pow(*args, **kwargs))

    def rank(self, *args, **kwargs):
        return MySeries(self.x.rank(*args, **kwargs))

    def round(self, *args, **kwargs):
        return MySeries(self.x.round(*args, **kwargs))

    def shift(self, *args, **kwargs):
        return MySeries(self.x.shift(*args, **kwargs))

    def sub(self, *args, **kwargs):
        return MySeries(self.x.sub(*args, **kwargs))

    def abs(self, *args, **kwargs):
        return MySeries(self.x.abs(*args, **kwargs))

    def clip(self, *args, **kwargs):
        return MySeries(self.x.clip(*args, **kwargs))

    def clip_lower(self, *args, **kwargs):
        return MySeries(self.x.clip_lower(*args, **kwargs))

    def clip_upper(self, *args, **kwargs):
        return MySeries(self.x.clip_upper(*args, **kwargs))

    def interpolate(self, *args, **kwargs):
        return MySeries(self.x.interpolate(*args, **kwargs))

    def resample(self, *args, **kwargs):
        return MySeries(self.x.resample(*args, **kwargs))

    def replace(self, *args, **kwargs):
        return MySeries(self.x.replace(*args, **kwargs))
# 분포 수를 계산 print(data.value_counts()) # 수치가 들어있는 확인 # TRUE, FALSE 반환 data.isin(['a', 'b']) # NaN(Not a Number) 결측치 확인 # TRUE, FALSE 반환 print(data.isnull()) # NA 처리 dropna 함수 print(data.dropna()) # 한개의 NA만 row에 포함되어 있더라도 해당 row 전체 제외함. print(data.dropna(how = 'all')) # row 전체가 na일 경우 w제외함 # NA 처리 0으로 처리하기 print(data.fillna(0)) # NA 처리 앞에 값으로 처리하기 print(data.fillna(method = 'ffill')) # NA 값은 모두 앞의 값으로 가져와서 NA 처리 하는 경우 print(data.fillna(method = 'ffill', limit = 1)) # 1번만 앞에 값으로 처리하고 그 이후는 NA로 처리할 경우 # NA 처리 평균 값으로 처리하기 print(data.fillna(data.mean())) # 평균 값 # null 값 제외한 값만 확인하기 print(data[data.notnull()]) # 기간 데이터 생성 (날짜) pd.date_range(start = '2020-01-01', end = '2020-01-07')
def test_datetime64_tz_fillna(self):
    """Check ``Series.fillna`` on naive and tz-aware datetime64 Series.

    Runs every case for both 'US/Eastern' and 'Asia/Tokyo'.  After each
    fill, ``null_loc`` is used to assert that the source Series still has
    its NaT entries, i.e. the non-inplace call did not modify it.
    """
    for tz in ['US/Eastern', 'Asia/Tokyo']:
        # DatetimeBlock
        # tz-naive Series with NaT at positions 1 and 3
        s = Series([
            Timestamp('2011-01-01 10:00'), pd.NaT,
            Timestamp('2011-01-03 10:00'), pd.NaT
        ])
        # expected isnull() mask of the untouched source Series
        null_loc = pd.Series([False, True, False, True])

        # naive Series, naive fill value
        result = s.fillna(pd.Timestamp('2011-01-02 10:00'))
        expected = Series([
            Timestamp('2011-01-01 10:00'),
            Timestamp('2011-01-02 10:00'),
            Timestamp('2011-01-03 10:00'),
            Timestamp('2011-01-02 10:00')
        ])
        self.assert_series_equal(expected, result)
        # check s is not changed
        self.assert_series_equal(pd.isnull(s), null_loc)

        # naive Series, tz-aware fill value: expected mixes naive and aware
        result = s.fillna(pd.Timestamp('2011-01-02 10:00', tz=tz))
        expected = Series([
            Timestamp('2011-01-01 10:00'),
            Timestamp('2011-01-02 10:00', tz=tz),
            Timestamp('2011-01-03 10:00'),
            Timestamp('2011-01-02 10:00', tz=tz)
        ])
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # naive Series, string fill value: expected is object dtype
        result = s.fillna('AAA')
        expected = Series([
            Timestamp('2011-01-01 10:00'), 'AAA',
            Timestamp('2011-01-03 10:00'), 'AAA'
        ], dtype=object)
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # per-position fill values given as a dict, one aware and one naive
        result = s.fillna({
            1: pd.Timestamp('2011-01-02 10:00', tz=tz),
            3: pd.Timestamp('2011-01-04 10:00')
        })
        expected = Series([
            Timestamp('2011-01-01 10:00'),
            Timestamp('2011-01-02 10:00', tz=tz),
            Timestamp('2011-01-03 10:00'),
            Timestamp('2011-01-04 10:00')
        ])
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # dict of naive fill values only
        result = s.fillna({
            1: pd.Timestamp('2011-01-02 10:00'),
            3: pd.Timestamp('2011-01-04 10:00')
        })
        expected = Series([
            Timestamp('2011-01-01 10:00'),
            Timestamp('2011-01-02 10:00'),
            Timestamp('2011-01-03 10:00'),
            Timestamp('2011-01-04 10:00')
        ])
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # DatetimeBlockTZ
        # same layout as above, but the Series is tz-aware in ``tz``
        idx = pd.DatetimeIndex(
            ['2011-01-01 10:00', pd.NaT, '2011-01-03 10:00', pd.NaT], tz=tz)
        s = pd.Series(idx)
        self.assertEqual(s.dtype, 'datetime64[ns, {0}]'.format(tz))
        self.assert_series_equal(pd.isnull(s), null_loc)

        # aware Series, naive fill value: expected mixes aware and naive
        result = s.fillna(pd.Timestamp('2011-01-02 10:00'))
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz),
            Timestamp('2011-01-02 10:00'),
            Timestamp('2011-01-03 10:00', tz=tz),
            Timestamp('2011-01-02 10:00')
        ])
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # aware Series, fill value in the same tz: expected built from a tz-aware index
        result = s.fillna(pd.Timestamp('2011-01-02 10:00', tz=tz))
        idx = pd.DatetimeIndex([
            '2011-01-01 10:00', '2011-01-02 10:00', '2011-01-03 10:00',
            '2011-01-02 10:00'
        ], tz=tz)
        expected = Series(idx)
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # same fill value, passed as a stdlib datetime instead of a Timestamp
        result = s.fillna(
            pd.Timestamp('2011-01-02 10:00', tz=tz).to_pydatetime())
        idx = pd.DatetimeIndex([
            '2011-01-01 10:00', '2011-01-02 10:00', '2011-01-03 10:00',
            '2011-01-02 10:00'
        ], tz=tz)
        expected = Series(idx)
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # aware Series, string fill value: expected is object dtype
        result = s.fillna('AAA')
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz), 'AAA',
            Timestamp('2011-01-03 10:00', tz=tz), 'AAA'
        ], dtype=object)
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # dict fill with one aware and one naive value
        result = s.fillna({
            1: pd.Timestamp('2011-01-02 10:00', tz=tz),
            3: pd.Timestamp('2011-01-04 10:00')
        })
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz),
            Timestamp('2011-01-02 10:00', tz=tz),
            Timestamp('2011-01-03 10:00', tz=tz),
            Timestamp('2011-01-04 10:00')
        ])
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # dict fill where every value carries the Series' own tz
        result = s.fillna({
            1: pd.Timestamp('2011-01-02 10:00', tz=tz),
            3: pd.Timestamp('2011-01-04 10:00', tz=tz)
        })
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz),
            Timestamp('2011-01-02 10:00', tz=tz),
            Timestamp('2011-01-03 10:00', tz=tz),
            Timestamp('2011-01-04 10:00', tz=tz)
        ])
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # filling with a naive/other zone, coerce to object
        result = s.fillna(Timestamp('20130101'))
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz),
            Timestamp('2013-01-01'),
            Timestamp('2011-01-03 10:00', tz=tz),
            Timestamp('2013-01-01')
        ])
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)

        # fill value in a fixed zone (US/Pacific) regardless of ``tz``
        result = s.fillna(Timestamp('20130101', tz='US/Pacific'))
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz),
            Timestamp('2013-01-01', tz='US/Pacific'),
            Timestamp('2011-01-03 10:00', tz=tz),
            Timestamp('2013-01-01', tz='US/Pacific')
        ])
        self.assert_series_equal(expected, result)
        self.assert_series_equal(pd.isnull(s), null_loc)
# Exploratory, REPL-style walk through Series selection, sorting and
# missing-value handling.  `obj` is not defined in this fragment;
# presumably it is a letter-indexed Series created earlier -- confirm.
obj.loc[['a', 'c', 'g']]
# NOTE(review): the .ix indexer used here and further down is not available
# in recent pandas releases -- confirm the pandas version this script targets.
obj.ix[1:6]
obj.shape
obj.reindex(['f', 'd', 'z'])
obj['z'] = 4
obj.mean()
obj.idxmin()
#how to return an integer position?
# The index is replaced with 0..len-1 before idxmax() is called.
obj.index = np.arange(len(obj.index))
obj.idxmax()
obj.sort_index()
obj.sort_values()
# Boolean mask: keep only the positive values
obj[obj > 0]
# NOTE(review): the index was replaced with integers above, yet string
# labels are used here -- verify this assignment does what was intended.
obj.ix[['d', 'f', 'j']] = np.nan
obj[obj.notnull()]
obj.dropna()
obj.fillna(0)
obj.fillna(method='bfill')
# NOTE(review): the key 'f' appears twice in this literal; a dict literal
# keeps only the last value for a repeated key, so 'f' maps to 9.
obj2 = Series({
    'a': 1,
    'b': 3,
    'f': 5,
    'g': 7,
    'f': 9,
    'h': 10,
    'x': 11,
    'y': 12,
    'z': 10
})
obj + obj2
obj + obj2.sort_index()
# NOTE(review): the statement below is cut off in this view of the file.
obj3 = Series(np.random.randn(11),
def test_dt_accessor_updates_on_inplace(self): s = Series(pd.date_range("2018-01-01", periods=10)) s[2] = None s.fillna(pd.Timestamp("2018-01-01"), inplace=True) result = s.dt.date assert result[0] == result[2]
def fillna(input: pd.Series) -> pd.Series:
    """Return *input* with missing entries replaced by "" and values cast to str."""
    without_missing = input.fillna("")
    as_text = without_missing.astype("str")
    return as_text
def answer(request):
    """Handle one step of a symptom questionnaire and return a JSON response.

    Reads three CSV files from ``DATA_DIR`` on every call (case numbers,
    case ratios and a symptom-by-disease matrix) and dispatches on
    ``request.data["pathtype"]``:

    * ``"getanswer"``: records the answer to question ``question_seq`` in
      the per-session state, then either proposes the next symptom to ask
      about (the symptom whose ``H2`` score is smallest) or, once
      ``H2.min() >= 1`` or 20 questions have been asked, loads four pickled
      classifiers and returns the three highest-scoring diseases from a
      weighted vote.
    * ``"cancelanswer"``: resets the stored answer for ``question_seq``.

    Per-session state lives in the cache under
    ``'global_dict' + answer_mainseqno`` with a timeout of 86400 and holds
    two DataFrames: ``'A' + id`` (20 rows: symptom_code, answer_detail) and
    ``'B' + id`` (one row per disease code, one float column per question
    1..20).

    Any other ``pathtype`` yields an empty JSON object.

    NOTE(review): the models are loaded with ``joblib.load``; pickle-based
    loading must only be used on trusted files -- confirm the .pkl files
    are not user-controllable.
    """
    # print(type(request.data))
    # print(request.data)
    DATA_DIR = os.path.join(PROJECT_DIR, 'data')
    #print(DATA_DIR)
    #print('read case number data')
    # only the first 77 rows of the case-number tables are used
    patient_number_data = pd.read_csv(
        os.path.join(DATA_DIR, "case_number_1025.csv"))[0:77]
    patient_number = pd.Series(patient_number_data['case_number'].values,
                               index=patient_number_data['disease_code'])
    patient_number.name = 'number'
    #print('read case number ratio data')
    patient_ratio_data = pd.read_csv(
        os.path.join(DATA_DIR, "case_number_ratio_1025.csv"))[0:77]
    patient_ratio = pd.Series(patient_ratio_data['case_ratio'].values,
                              index=patient_ratio_data['disease_code'])
    patient_ratio.name = 'ratio'
    #print('read disease data')
    disease_sym_matrix = pd.read_csv(
        os.path.join(DATA_DIR, '77_disease_data_1025.csv'))
    SD_symcode = pd.DataFrame(disease_sym_matrix['symptom_cause'])
    SD_symcode.columns = ['symptom_code']
    symptom_code = disease_sym_matrix['symptom_cause']
    symptom_code.name = '0'
    disease_code = patient_number_data['disease_code']
    disease_code.name = '0'
    disease_number = patient_number_data.shape[0]
    symptom_number = len(symptom_code)
    # from here on the matrix is looked up by symptom code via .loc
    disease_sym_matrix.set_index(["symptom_cause"], inplace=True)

    # input data information
    jsondata = request.data
    # jsondata = {
    #     "pathtype": "getanswer",
    #     "answer_record_seqno": "201908051619300001938_007",
    #     "content":
    #         {
    #             "answer_mainseqno": "201908051619300001938",
    #             "question_seq": "1",
    #             "actual_symptom_code": ["C0015230"],
    #             "answer_detail": ["是"]
    #         }
    # }
    #print(jsondata["pathtype"])
    content = jsondata['content']
    #print(content)
    python_output = {}
    # session state saved by an earlier request, or None if there is none
    global_dict = cache.get(('global_dict' + content["answer_mainseqno"]),
                            None)

    # programming
    # interface 1--getanswer
    if jsondata["pathtype"] == "getanswer":
        #print('-'*10 + 'getanswer')
        if content["question_seq"] == "1":
            # establish unit matrix
            # the first question always starts a fresh session state
            global_dict = {}
            global_dict['A' + content["answer_mainseqno"]] = pd.DataFrame(
                data=0,
                columns=["symptom_code", "answer_detail"],
                index=range(1, 21))
            global_dict['A' +
                        content["answer_mainseqno"]]['symptom_code'] = '0'
            global_dict['B' + content["answer_mainseqno"]] = pd.DataFrame(
                data=1,
                columns=range(1, 21),
                index=patient_number.index,
                dtype=float)

        # check the json string whether has wrong information or whether to end(20 question so far).
        if not global_dict:
            # no saved session; errmessage reads "the overall session key does not exist"
            result_code = 'FAIL'
            answer_mainseqno = ''
            ifend = ''
            next_symptom_code = []
            next_answer_detail = []
            confirm_disease_code = ''
            confirm_disease_percent = ''
            disease_array = []
            disease_case_number = ''
            errmessage = '整体会话主键不存在'
        elif content["actual_symptom_code"][0] not in list(symptom_code):
            # unknown symptom; errmessage reads "the symptom code does not exist"
            result_code = 'FAIL'
            answer_mainseqno = ''
            ifend = ''
            next_symptom_code = ['']
            next_answer_detail = ['']
            confirm_disease_code = ''
            confirm_disease_percent = ''
            disease_array = []
            disease_case_number = ''
            errmessage = '症状代码不存在'
        else:
            # confirm the correct json string
            # update A_ and B_
            global_dict['A' + content["answer_mainseqno"]].iloc[(int(content["question_seq"]) - 1), 0] = \
                content["actual_symptom_code"][0]
            # "是" = yes, "否" = no; any other answer is stored as NaN
            if content["answer_detail"][0] == "是":
                global_dict['A' + content["answer_mainseqno"]].iloc[(
                    int(content["question_seq"]) - 1), 1] = 1
                global_dict['B' + content["answer_mainseqno"]][int(
                    content["question_seq"])] = disease_sym_matrix.loc[
                        content["actual_symptom_code"][0]]
            elif content["answer_detail"][0] == "否":
                global_dict['A' + content["answer_mainseqno"]].iloc[(
                    int(content["question_seq"]) - 1), 1] = 0
                global_dict['B' + content["answer_mainseqno"]][int(content["question_seq"])] = 1 - \
                    disease_sym_matrix.loc[
                        content[
                            "actual_symptom_code"][
                            0]]
            else:
                global_dict['A' + content["answer_mainseqno"]].iloc[(
                    int(content["question_seq"]) - 1), 1] = np.nan
                global_dict['B' + content["answer_mainseqno"]][int(
                    content["question_seq"])] = 1
            cache.set(('global_dict' + content["answer_mainseqno"]),
                      global_dict,
                      timeout=86400)

            # establish temporary dataframe,store A and B,perform single-threaded operations
            A = global_dict['A' + content["answer_mainseqno"]].copy()
            # in the working copy, NaN answers are reset to symptom '0' / answer 0
            A.loc[np.isnan(A['answer_detail']), 'symptom_code'] = '0'
            A.loc[np.isnan(A['answer_detail']), 'answer_detail'] = 0
            B = global_dict['B' + content["answer_mainseqno"]]

            # calculate similar disease case number(disease_case_number) and probability(disease_array)
            # has been removed

            # calculate entropy to get the next question
            H20 = Series([0] * symptom_number)
            H20 = H20.astype(float)
            H21 = Series([0] * symptom_number)
            H21 = H21.astype(float)
            # NOTE(review): pb20 and pb21 are rebuilt from H20 rather than
            # from the Series created on the preceding line; H20 is all
            # zeros here, so the values are zeros either way -- confirm intent.
            pb20 = Series([0] * symptom_number)
            pb20 = H20.astype(float)
            pb21 = Series([0] * symptom_number)
            pb21 = H20.astype(float)
            # NOTE(review): the product covers columns 1..19 only; column 20 is excluded.
            B_mul = B[1] * B[2] * B[3] * B[4] * B[5] * B[6] * B[7] * B[8] * B[9] * B[10] * B[11] * B[12] * B[13] * \
                B[14] * B[15] * B[16] * B[17] * B[18] * B[19]
            for i in range(symptom_number):
                if symptom_code[i] in list(global_dict[
                        'A' + content["answer_mainseqno"]]["symptom_code"]):
                    # already-asked symptoms get a large score so they are never the minimum
                    H20[i] = 100
                    H21[i] = 100
                    pb20[i] = 1
                    pb21[i] = 1
                else:
                    pb20[i] = (B_mul *
                               (1 - disease_sym_matrix.loc[symptom_code[i]]) *
                               patient_number).sum() / (B_mul *
                                                        patient_number).sum()
                    pb21[i] = 1 - pb20[i]
                    if pb21[i] == 0:
                        H20[i] = 10
                        H21[i] = 10
                    else:
                        data_pba20 = B_mul * (
                            1 - disease_sym_matrix.loc[symptom_code[i]])
                        data_mul20 = data_pba20.mul(patient_ratio, axis=0)
                        data_pab20 = data_mul20 / data_mul20.sum()
                        # zero entries are dropped before taking logarithms
                        data_pab_d020 = data_pab20[data_pab20 != 0]
                        # NOTE(review): [j] indexes a Series whose index is
                        # disease codes -- presumably meant positionally; confirm.
                        for j in range(0, len(data_pab_d020)):
                            H20[i] = H20[i] - data_pab_d020[j] * math.log(
                                data_pab_d020[j], disease_number)
                        data_pba21 = B_mul * disease_sym_matrix.loc[
                            symptom_code[i]]
                        data_mul21 = data_pba21.mul(patient_ratio, axis=0)
                        data_pab21 = data_mul21 / data_mul21.sum()
                        data_pab_d021 = data_pab21[data_pab21 != 0]
                        for j in range(0, len(data_pab_d021)):
                            H21[i] = H21[i] - data_pab_d021[j] * math.log(
                                data_pab_d021[j], disease_number)
            H20 = H20.fillna(0)
            H21 = H21.fillna(0)
            H2 = H20 * pb20 + H21 * pb21

            # determine whether end or not.
            if H2.min() < 1 and int(content["question_seq"]) < 20:
                # get the next question(corresponding symptom code) and other output information
                H2_sym = pd.DataFrame({'H2': H2, 'sym': symptom_code})
                # first symptom whose H2 equals the minimum
                next_symptom_code_str = H2_sym['sym'][
                    H2_sym['H2'] == H2_sym['H2'].min()].values[0]
                next_symptom_code = []
                next_symptom_code.append(next_symptom_code_str)
                result_code = 'SUCCESS'
                answer_mainseqno = content["answer_mainseqno"]
                ifend = '0'
                # answer options: "yes", "no", "not sure"
                next_answer_detail = ["是", "否", "不确定"]
                confirm_disease_code = ''
                confirm_disease_percent = ''
                disease_array = ['']
                errmessage = ''
            else:
                #get the model file using joblib
                forest_clf_file = "forest_clf_1025.pkl"
                forest_clf = joblib.load(
                    os.path.join(DATA_DIR, forest_clf_file))
                gnb_clf_file = "gnb_clf_1025.pkl"
                gnb_clf = joblib.load(os.path.join(DATA_DIR, gnb_clf_file))
                mnb_clf_file = "mnb_clf_1025.pkl"
                mnb_clf = joblib.load(os.path.join(DATA_DIR, mnb_clf_file))
                bnb_clf_file = "bnb_clf_1025.pkl"
                bnb_clf = joblib.load(os.path.join(DATA_DIR, bnb_clf_file))

                #ensemble learning to predict disease
                # one answer per known symptom, ordered by symptom code; unanswered -> 0
                Sd = pd.merge(A, SD_symcode, how='right')
                Sd['answer_detail'] = Sd['answer_detail'].fillna(0)
                Sd = Sd.sort_values(by="symptom_code", ascending=True)
                some_digit = np.array(Sd["answer_detail"])

                #first classifier:Bayes method,based on probability
                A_sym = list(A[A["answer_detail"] == 1]["symptom_code"])
                sym_num = len(A_sym)
                Bys_dsm = (disease_sym_matrix.copy()).T
                # helper column of ones used as the starting value of the product
                Bys_dsm['1'] = 1
                Bys_pba = Bys_dsm['1']
                for i in range(sym_num):
                    Bys_pba = Bys_pba * Bys_dsm[A_sym[i]]
                Bys_baa = Bys_pba * patient_ratio
                Bys_pab = Bys_baa / sum(Bys_baa)
                bys_clf_predict = disease_code.copy()
                bys_clf_predict = pd.DataFrame(bys_clf_predict)
                bys_clf_predict.columns = ['bys_clf.classes']
                bys_clf_predict['predict_proba'] = list(Bys_pab)

                #second classifier:Random Forest
                forest_clf_predict_proba1 = pd.DataFrame(
                    forest_clf.predict_proba([some_digit])[0],
                    columns=['predict_proba'])
                forest_clf_classes1 = pd.DataFrame(forest_clf.classes_,
                                                   columns=['sgd_clf.classes'])
                forest_clf_predict = pd.merge(forest_clf_classes1,
                                              forest_clf_predict_proba1,
                                              left_index=True,
                                              right_index=True)

                #third classifier:GaussianNB
                gnb_clf_predict_proba1 = pd.DataFrame(
                    gnb_clf.predict_proba([some_digit])[0],
                    columns=['predict_proba'])
                gnb_clf_classes1 = pd.DataFrame(gnb_clf.classes_,
                                                columns=['gnd_clf.classes'])
                gnb_clf_predict = pd.merge(gnb_clf_classes1,
                                           gnb_clf_predict_proba1,
                                           left_index=True,
                                           right_index=True)

                #fourth:MultinomialNB
                mnb_clf_predict_proba1 = pd.DataFrame(
                    mnb_clf.predict_proba([some_digit])[0],
                    columns=['predict_proba'])
                mnb_clf_classes1 = pd.DataFrame(mnb_clf.classes_,
                                                columns=['mnd_clf.classes'])
                mnb_clf_predict = pd.merge(mnb_clf_classes1,
                                           mnb_clf_predict_proba1,
                                           left_index=True,
                                           right_index=True)

                #fifth:BernoulliNB
                bnb_clf_predict_proba1 = pd.DataFrame(
                    bnb_clf.predict_proba([some_digit])[0],
                    columns=['predict_proba'])
                bnb_clf_classes1 = pd.DataFrame(bnb_clf.classes_,
                                                columns=['bnd_clf.classes'])
                bnb_clf_predict = pd.merge(bnb_clf_classes1,
                                           bnb_clf_predict_proba1,
                                           left_index=True,
                                           right_index=True)

                #ensemble soft voting
                # weights: Bayes 1/5, random forest 3/5, each naive-Bayes model 1/15
                # NOTE(review): the five probability columns are combined by
                # row index; this assumes every model lists the classes in
                # the same order as disease_code -- confirm.
                ensemble_proba = (1 / 5) * bys_clf_predict['predict_proba'] + (
                    3 / 5) * forest_clf_predict['predict_proba'] + (
                        1 / 15) * gnb_clf_predict['predict_proba'] + (
                            1 / 15) * mnb_clf_predict['predict_proba'] + (
                                1 / 15) * bnb_clf_predict['predict_proba']
                esm_clf_predict = disease_code.copy()
                esm_clf_predict = pd.DataFrame(esm_clf_predict)
                esm_clf_predict.columns = ['esm_clf.classes']
                # the combined score is scaled by 0.98 before being reported
                esm_clf_predict['predict_proba'] = ensemble_proba * 0.98
                esm_clf_predict = esm_clf_predict.sort_values(
                    by="predict_proba", ascending=False)

                #extract outcome
                result_code = 'SUCCESS'
                answer_mainseqno = content["answer_mainseqno"]
                ifend = '1'
                next_symptom_code = ['']
                next_answer_detail = ['']
                #need to calculate the similarity
                confirm_disease_code = str(
                    list(esm_clf_predict['esm_clf.classes'])[0])
                confirm_disease_percent = str(
                    list(esm_clf_predict['predict_proba'])[0])
                # the three highest-scoring diseases, best first
                disease_array = [{
                    "disease_code":
                    str(list(esm_clf_predict['esm_clf.classes'])[0]),
                    "disease_percent":
                    str(list(esm_clf_predict['predict_proba'])[0])
                }, {
                    "disease_code":
                    str(list(esm_clf_predict['esm_clf.classes'])[1]),
                    "disease_percent":
                    str(list(esm_clf_predict['predict_proba'])[1])
                }, {
                    "disease_code":
                    str(list(esm_clf_predict['esm_clf.classes'])[2]),
                    "disease_percent":
                    str(list(esm_clf_predict['predict_proba'])[2])
                }]
                errmessage = ''
                # the session is finished, so its cached state is removed
                cache.delete(('global_dict' + content["answer_mainseqno"]))
                del forest_clf, gnb_clf, mnb_clf, bnb_clf
        python_output = {
            "resultcode": result_code,
            "answer_record_seqno": jsondata["answer_record_seqno"],
            "returnmessage": {
                "answer_mainseqno": answer_mainseqno,
                "ifend": str(ifend),
                "next_symptom_code": next_symptom_code,
                "next_answer_detail": next_answer_detail,
                "confirm_disease_code": confirm_disease_code,
                "confirm_disease_percent": str(confirm_disease_percent),
                "disease_array": disease_array,
                "disease_case_number": ''
            },
            "errmessage": errmessage
        }
        #print('-'*10 + 'getanswer end')
    # interface 2--cancelanswer
    elif jsondata["pathtype"] == "cancelanswer":
        #print('-'*10 + 'cancelanswer')
        if not global_dict:
            # errmessage reads "the overall session key does not exist"
            result_code = 'FAIL'
            answer_mainseqno = ''
            errmessage = '整体会话主键不存在'
        else:
            # reset the cancelled question to its initial state
            global_dict['A' + content["answer_mainseqno"]].iloc[(
                int(content["question_seq"]) - 1), 0] = '0'
            global_dict['A' + content["answer_mainseqno"]].iloc[(
                int(content["question_seq"]) - 1), 1] = 0
            global_dict['B' + content["answer_mainseqno"]][int(
                content["question_seq"])] = 1
            result_code = 'SUCCESS'
            answer_mainseqno = content["answer_mainseqno"]
            errmessage = ''
            cache.set(('global_dict' + content["answer_mainseqno"]),
                      global_dict,
                      timeout=86400)
        python_output = {
            "resultcode": result_code,
            "answer_record_seqno": jsondata["answer_record_seqno"],
            "answer_mainseqno": answer_mainseqno,
            "errmessage": errmessage
        }
        #print('-'*10 + 'cancelanswer end')
    # output the json string
    del patient_number_data, patient_ratio_data, disease_sym_matrix
    # NOTE(review): json_output is computed but never used; the dict itself is returned.
    json_output = json.dumps(python_output)
    #print(json_output)
    return JsonResponse(python_output, safe=False)
def target_discrete_price_variation(pct_var: pd.Series, **kwargs):
    """Discretize a percentage-variation Series into classes.

    Missing values are forward-filled before ``to_discrete_double`` is
    applied with the thresholds -0.01 and 0.01.  The result is returned as
    a Series carrying the index of *pct_var*.

    ``**kwargs`` is accepted for signature compatibility and is not used.
    """
    # .ffill() replaces fillna(method='ffill'): the ``method`` keyword of
    # fillna is deprecated in recent pandas releases.
    classes = to_discrete_double(pct_var.ffill(), -0.01, 0.01)
    return pd.Series(classes, index=pct_var.index)
def test_datetime64_tz_fillna(self):
    """Check ``Series.fillna`` on naive and tz-aware datetime64 Series.

    Runs every case for both 'US/Eastern' and 'Asia/Tokyo'.  Fill values
    are scalars (Timestamp, datetime, string) or dicts keyed by position.
    """
    for tz in ['US/Eastern', 'Asia/Tokyo']:
        # DatetimeBlock
        # tz-naive Series with NaT at positions 1 and 3
        s = Series([
            Timestamp('2011-01-01 10:00'), pd.NaT,
            Timestamp('2011-01-03 10:00'), pd.NaT
        ])

        # naive Series, naive fill value
        result = s.fillna(pd.Timestamp('2011-01-02 10:00'))
        expected = Series([
            Timestamp('2011-01-01 10:00'),
            Timestamp('2011-01-02 10:00'),
            Timestamp('2011-01-03 10:00'),
            Timestamp('2011-01-02 10:00')
        ])
        self.assert_series_equal(expected, result)

        # naive Series, tz-aware fill value: expected mixes naive and aware
        result = s.fillna(pd.Timestamp('2011-01-02 10:00', tz=tz))
        expected = Series([
            Timestamp('2011-01-01 10:00'),
            Timestamp('2011-01-02 10:00', tz=tz),
            Timestamp('2011-01-03 10:00'),
            Timestamp('2011-01-02 10:00', tz=tz)
        ])
        self.assert_series_equal(expected, result)

        # naive Series, string fill value: expected is object dtype
        result = s.fillna('AAA')
        expected = Series([
            Timestamp('2011-01-01 10:00'), 'AAA',
            Timestamp('2011-01-03 10:00'), 'AAA'
        ], dtype=object)
        self.assert_series_equal(expected, result)

        # per-position fill values given as a dict, one aware and one naive
        result = s.fillna({
            1: pd.Timestamp('2011-01-02 10:00', tz=tz),
            3: pd.Timestamp('2011-01-04 10:00')
        })
        expected = Series([
            Timestamp('2011-01-01 10:00'),
            Timestamp('2011-01-02 10:00', tz=tz),
            Timestamp('2011-01-03 10:00'),
            Timestamp('2011-01-04 10:00')
        ])
        self.assert_series_equal(expected, result)

        # dict of naive fill values only
        result = s.fillna({
            1: pd.Timestamp('2011-01-02 10:00'),
            3: pd.Timestamp('2011-01-04 10:00')
        })
        expected = Series([
            Timestamp('2011-01-01 10:00'),
            Timestamp('2011-01-02 10:00'),
            Timestamp('2011-01-03 10:00'),
            Timestamp('2011-01-04 10:00')
        ])
        self.assert_series_equal(expected, result)

        # DatetimeBlockTZ
        # same layout as above, but the Series is tz-aware in ``tz``
        idx = pd.DatetimeIndex(
            ['2011-01-01 10:00', pd.NaT, '2011-01-03 10:00', pd.NaT], tz=tz)
        s = pd.Series(idx)

        # aware Series, naive fill value: expected mixes aware and naive
        result = s.fillna(pd.Timestamp('2011-01-02 10:00'))
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz),
            Timestamp('2011-01-02 10:00'),
            Timestamp('2011-01-03 10:00', tz=tz),
            Timestamp('2011-01-02 10:00')
        ])
        self.assert_series_equal(expected, result)

        # aware Series, fill value in the same tz: expected built from a tz-aware index
        result = s.fillna(pd.Timestamp('2011-01-02 10:00', tz=tz))
        idx = pd.DatetimeIndex([
            '2011-01-01 10:00', '2011-01-02 10:00', '2011-01-03 10:00',
            '2011-01-02 10:00'
        ], tz=tz)
        expected = Series(idx)
        self.assert_series_equal(expected, result)

        # same fill value, passed as a stdlib datetime instead of a Timestamp
        result = s.fillna(
            pd.Timestamp('2011-01-02 10:00', tz=tz).to_pydatetime())
        idx = pd.DatetimeIndex([
            '2011-01-01 10:00', '2011-01-02 10:00', '2011-01-03 10:00',
            '2011-01-02 10:00'
        ], tz=tz)
        expected = Series(idx)
        self.assert_series_equal(expected, result)

        # aware Series, string fill value: expected is object dtype
        result = s.fillna('AAA')
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz), 'AAA',
            Timestamp('2011-01-03 10:00', tz=tz), 'AAA'
        ], dtype=object)
        self.assert_series_equal(expected, result)

        # dict fill with one aware and one naive value
        result = s.fillna({
            1: pd.Timestamp('2011-01-02 10:00', tz=tz),
            3: pd.Timestamp('2011-01-04 10:00')
        })
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz),
            Timestamp('2011-01-02 10:00', tz=tz),
            Timestamp('2011-01-03 10:00', tz=tz),
            Timestamp('2011-01-04 10:00')
        ])
        self.assert_series_equal(expected, result)

        # dict fill where every value carries the Series' own tz
        result = s.fillna({
            1: pd.Timestamp('2011-01-02 10:00', tz=tz),
            3: pd.Timestamp('2011-01-04 10:00', tz=tz)
        })
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz),
            Timestamp('2011-01-02 10:00', tz=tz),
            Timestamp('2011-01-03 10:00', tz=tz),
            Timestamp('2011-01-04 10:00', tz=tz)
        ])
        self.assert_series_equal(expected, result)

        # filling with a naive/other zone, coerce to object
        result = s.fillna(Timestamp('20130101'))
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz),
            Timestamp('2013-01-01'),
            Timestamp('2011-01-03 10:00', tz=tz),
            Timestamp('2013-01-01')
        ])
        self.assert_series_equal(expected, result)

        # fill value in a fixed zone (US/Pacific) regardless of ``tz``
        result = s.fillna(Timestamp('20130101', tz='US/Pacific'))
        expected = Series([
            Timestamp('2011-01-01 10:00', tz=tz),
            Timestamp('2013-01-01', tz='US/Pacific'),
            Timestamp('2011-01-03 10:00', tz=tz),
            Timestamp('2013-01-01', tz='US/Pacific')
        ])
        self.assert_series_equal(expected, result)
def target_binary_price_variation(pct_var: pd.Series, **kwargs):
    """Discretize a percentage-variation Series around a single threshold.

    Missing values are forward-filled before ``to_discrete_single`` is
    applied with the threshold 0.00.  The result is returned as a Series
    carrying the index of *pct_var*.

    ``**kwargs`` is accepted for signature compatibility and is not used.
    """
    # .ffill() replaces fillna(method='ffill'): the ``method`` keyword of
    # fillna is deprecated in recent pandas releases.
    classes = to_discrete_single(pct_var.ffill(), 0.00)
    return pd.Series(classes, index=pct_var.index)
def test_fillna_categorical_raises(self):
    """``fillna`` on a categorical Series must reject invalid fill values.

    Values outside the categories raise ``ValueError``; list, tuple and
    DataFrame fill values raise ``TypeError`` with a type-specific message.
    """
    ser = Series(
        Categorical(["a", np.nan, "b", np.nan, np.nan], categories=["a", "b"])
    )

    # (expected exception, message pattern, offending fill value), checked in order
    failing_cases = [
        (ValueError, "fill value must be in categories", "d"),
        (ValueError, "fill value must be in categories", Series("d")),
        (ValueError, "fill value must be in categories", {1: "d", 3: "a"}),
        (
            TypeError,
            '"value" parameter must be a scalar or dict, but you passed a "list"',
            ["a", "b"],
        ),
        (
            TypeError,
            '"value" parameter must be a scalar or dict, but you passed a "tuple"',
            ("a", "b"),
        ),
        (
            TypeError,
            '"value" parameter must be a scalar, dict '
            'or Series, but you passed a "DataFrame"',
            DataFrame({1: ["a"], 3: ["b"]}),
        ),
    ]

    for exc_type, pattern, fill_value in failing_cases:
        with pytest.raises(exc_type, match=pattern):
            ser.fillna(fill_value)
class ESRIAsc:
    """In-memory representation of an ESRI ASCII (.asc) raster grid.

    The six header fields (ncols, nrows, xllcorner, yllcorner, cellsize,
    NODATA_value) are kept as attributes of the same name and the cell
    values are kept flattened in ``data``.
    """

    def __init__(self, file_path=None, ncols=None, nrows=None,
                 xllcorner=None, yllcorner=None, cellsize=1,
                 NODATA_value=-9999, data=None):
        """Build a grid from keyword values or by reading ``file_path``.

        Arguments:
            file_path (str): path of an .asc file to load, or None
            ncols (int): number of columns
            nrows (int): number of rows
            xllcorner (float): header value of the same name
            yllcorner (float): header value of the same name
            cellsize (int or float): header value of the same name
            NODATA_value (float): value marking cells without data
            data: flattened cell values; a pandas Series when read
                from a file

        Raises:
            AssertionError: if the number of values read from the file
                differs from nrows * ncols
        """
        self.file_path = file_path
        self.ncols = ncols
        self.nrows = nrows
        self.xllcorner = xllcorner
        self.yllcorner = yllcorner
        self.cellsize = cellsize
        self.NODATA_value = NODATA_value
        self.data = data

        # if a file is provided, the file metadata will overwrite any
        # user-provided kwargs
        if file_path:

            def next_header_value(handle):
                # each header line is "<name> <value>"; keep the value
                return handle.readline().strip().split()[1]

            # the context manager closes the file even if parsing fails
            with open(file_path, 'r') as f:
                self.ncols = int(next_header_value(f))
                self.nrows = int(next_header_value(f))
                self.xllcorner = float(next_header_value(f))
                self.yllcorner = float(next_header_value(f))
                self.cellsize = self._parse_cellsize(next_header_value(f))
                self.NODATA_value = float(next_header_value(f))

                # should not be necessary for well-formed ESRI files, but
                # seems to be for CASiMiR
                data_str = ' '.join(line.strip() for line in f)

            self.data = Series(fromstring(data_str, dtype=float, sep=' '))

            colrow_prod = self.nrows*self.ncols

            assert len(self.data) == colrow_prod, \
                "length of .asc data does not equal product of ncols * nrows" \
                "\nncols: {}, nrows: {}, ncols*nrows: {} len(data): {}".format(
                    self.ncols, self.nrows, colrow_prod, len(self.data))

    @staticmethod
    def _parse_cellsize(token):
        """Parse a cellsize header token as int, or float if it is not one."""
        try:
            return int(token)
        except ValueError:
            # fractional cell sizes such as "0.5" are not valid int literals
            return float(token)

    @staticmethod
    def _same_data(left, right):
        """Return True when two data containers hold equal cell values."""
        if left is None or right is None:
            return left is right

        # compare the raw values so index labels do not take part
        left_flat = reshape(getattr(left, 'values', left), -1)
        right_flat = reshape(getattr(right, 'values', right), -1)

        if left_flat.shape != right_flat.shape:
            return False

        return bool((left_flat == right_flat).all())

    def as_matrix(self, replace_nodata_val=None):
        """
        Convenience method to give 2D numpy.ndarray representation. If
        replace_nodata_val is given, replace all NODATA_value entries with it.

        Arguments:
            replace_nodata_val (float): value with which to replace
                NODATA_value entries

        Returns:
            (numpy.ndarray) matrix representation of the data in the .asc
        """
        # Reshape the underlying values instead of the Series itself, since
        # reshaping a Series to two dimensions can fail; the copy keeps the
        # replacement below from touching self.data.
        cell_values = getattr(self.data, 'values', self.data)
        ret = reshape(cell_values, (self.nrows, self.ncols)).copy()

        if replace_nodata_val is not None:
            ret[ret == self.NODATA_value] = replace_nodata_val

        return ret

    def write(self, write_path):
        """Write the grid to ``write_path`` in ESRI ASCII format.

        NaN entries of ``self.data`` are replaced by ``NODATA_value`` first,
        and that replacement is stored back on the instance.

        Arguments:
            write_path (str): path of the file to create or overwrite
        """
        # replace nan with NODATA_value
        self.data = self.data.fillna(self.NODATA_value)

        with open(write_path, 'w+') as f:
            f.write("ncols {}\n".format(self.ncols))
            f.write("nrows {}\n".format(self.nrows))
            f.write("xllcorner {}\n".format(self.xllcorner))
            f.write("yllcorner {}\n".format(self.yllcorner))
            f.write("cellsize {}\n".format(self.cellsize))
            f.write("NODATA_value {}\n".format(self.NODATA_value))

            # prob not most efficient, but CASiMiR requires
            # ESRI Ascii w/ newlines
            f.write(
                '\n'.join(
                    ' '.join(str(v) for v in row)
                    for row in self.as_matrix()
                )
            )

    def __eq__(self, other):
        """Compare header fields and cell values with another ESRIAsc.

        Returns NotImplemented for any other type. The data are compared
        only when every header field matches, so grids of different size
        compare unequal instead of raising.
        """
        if not isinstance(other, ESRIAsc):
            return NotImplemented

        same_header = (
            self.ncols == other.ncols
            and self.nrows == other.nrows
            and self.xllcorner == other.xllcorner
            and self.yllcorner == other.yllcorner
            and self.cellsize == other.cellsize
            and self.NODATA_value == other.NODATA_value
        )
        if not same_header:
            return False

        return self._same_data(self.data, other.data)