def test_from_weekly_resampling(self):
    idxh = date_range('1/1/1999', periods=52, freq='W')
    idxl = date_range('1/1/1999', periods=12, freq='M')
    high = Series(np.random.randn(len(idxh)), idxh)
    low = Series(np.random.randn(len(idxl)), idxl)
    low.plot()
    ax = high.plot()

    expected_h = idxh.to_period().asi8.astype(np.float64)
    expected_l = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544,
                           1549, 1553, 1558, 1562], dtype=np.float64)
    for l in ax.get_lines():
        # assertTrue(a, b) treats b as a message and always passes for a
        # truthy a; an equality assertion is what is meant here
        self.assertEqual(PeriodIndex(data=l.get_xdata()).freq, idxh.freq)
        xdata = l.get_xdata(orig=False)
        if len(xdata) == 12:  # idxl lines
            self.assert_numpy_array_equal(xdata, expected_l)
        else:
            self.assert_numpy_array_equal(xdata, expected_h)
    tm.close()

    # tsplot
    from pandas.tseries.plotting import tsplot
    import matplotlib.pyplot as plt

    tsplot(low, plt.Axes.plot)
    lines = tsplot(high, plt.Axes.plot)
    for l in lines:
        self.assertEqual(PeriodIndex(data=l.get_xdata()).freq, idxh.freq)
        xdata = l.get_xdata(orig=False)
        if len(xdata) == 12:  # idxl lines
            self.assert_numpy_array_equal(xdata, expected_l)
        else:
            self.assert_numpy_array_equal(xdata, expected_h)
def test_coercion_with_loc_and_series(self):
    for start_data, expected_result in self.EXPECTED_RESULTS:
        start_series = Series(start_data)
        start_series.loc[start_series == start_series[0]] = None

        expected_series = Series(expected_result)
        tm.assert_series_equal(start_series, expected_series)
def test_join_aware(self):
    rng = date_range('1/1/2011', periods=10, freq='H')
    ts = Series(np.random.randn(len(rng)), index=rng)

    ts_utc = ts.tz_localize('utc')

    self.assertRaises(Exception, ts.__add__, ts_utc)
    self.assertRaises(Exception, ts_utc.__add__, ts)

    test1 = DataFrame(np.zeros((6, 3)),
                      index=date_range("2012-11-15 00:00:00", periods=6,
                                       freq="100L", tz="US/Central"))
    test2 = DataFrame(np.zeros((3, 3)),
                      index=date_range("2012-11-15 00:00:00", periods=3,
                                       freq="250L", tz="US/Central"),
                      columns=range(3, 6))

    result = test1.join(test2, how='outer')
    ex_index = test1.index.union(test2.index)

    self.assertTrue(result.index.equals(ex_index))
    self.assertTrue(result.index.tz.zone == 'US/Central')

    # non-overlapping
    rng = date_range("2012-11-15 00:00:00", periods=6, freq="H",
                     tz="US/Central")
    rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H",
                      tz="US/Eastern")

    result = rng.union(rng2)
    self.assertTrue(result.tz.zone == 'UTC')
def _wrap_results(result, dtype):
    """ wrap our results if needed """

    if issubclass(dtype.type, np.datetime64):
        if not isinstance(result, np.ndarray):
            result = lib.Timestamp(result)
        else:
            result = result.view(dtype)
    elif issubclass(dtype.type, np.timedelta64):
        if not isinstance(result, np.ndarray):

            # this is a scalar timedelta result!
            # we have series convert then take the element (scalar)
            # as series will do the right thing in py3 (and deal with a
            # numpy 1.6.2 bug that returns a dtype of timedelta64[us])
            from pandas import Series

            # coerce float to results
            if is_float(result):
                result = int(result)
            result = Series([result], dtype='timedelta64[ns]')
        else:
            result = result.view(dtype)

    return result
def _delegate_property_get(self, name):
    from pandas import Series

    result = getattr(self.values, name)

    # maybe need to upcast (ints)
    if isinstance(result, np.ndarray):
        if is_integer_dtype(result):
            result = result.astype('int64')
    elif not is_list_like(result):
        return result

    result = np.asarray(result)

    # blow up if we operate on categories
    if self.orig is not None:
        result = take_1d(result, self.orig.cat.codes)

    # return the result as a Series, which is by definition a copy
    result = Series(result, index=self.index, name=self.name)

    # setting this object will show a SettingWithCopyWarning/Error
    result.is_copy = ("modifications to a property of a datetimelike "
                      "object are not supported and are discarded. "
                      "Change values on the original.")

    return result
def test_get():
    # GH 6383
    s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56,
                         45, 51, 39, 55, 43, 54, 52, 51, 54]))

    result = s.get(25, 0)
    expected = 0
    assert result == expected

    s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56,
                         45, 51, 39, 55, 43, 54, 52, 51, 54]),
               index=pd.Float64Index(
                   [25.0, 36.0, 49.0, 64.0, 81.0, 100.0, 121.0, 144.0,
                    169.0, 196.0, 1225.0, 1296.0, 1369.0, 1444.0, 1521.0,
                    1600.0, 1681.0, 1764.0, 1849.0, 1936.0],
                   dtype='object'))

    result = s.get(25, 0)
    expected = 43
    assert result == expected

    # GH 7407
    # with a boolean accessor
    df = pd.DataFrame({'i': [0] * 3, 'b': [False] * 3})
    vc = df.i.value_counts()
    result = vc.get(99, default='Missing')
    assert result == 'Missing'

    vc = df.b.value_counts()
    result = vc.get(False, default='Missing')
    assert result == 3

    result = vc.get(True, default='Missing')
    assert result == 'Missing'
def test_tz_aware_asfreq(self):
    dr = date_range('2011-12-01', '2012-07-20', freq='D', tz='US/Eastern')

    s = Series(np.random.randn(len(dr)), index=dr)

    # it works!
    s.asfreq('T')
def test_mixed_freq_irreg_period(self):
    ts = tm.makeTimeSeries()
    irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]]
    rng = period_range('1/3/2000', periods=30, freq='B')
    ps = Series(np.random.randn(len(rng)), rng)
    irreg.plot()
    ps.plot()
def test_tz_aware_asfreq(self):
    dr = date_range("2011-12-01", "2012-07-20", freq="D",
                    tz=self.tzstr("US/Eastern"))

    s = Series(np.random.randn(len(dr)), index=dr)

    # it works!
    s.asfreq("T")
def test_constructor(self):
    assert self.ts.index.is_all_dates

    # Pass in Series
    derived = Series(self.ts)
    assert derived.index.is_all_dates

    assert tm.equalContents(derived.index, self.ts.index)
    # Ensure new index is not created
    assert id(self.ts.index) == id(derived.index)

    # Mixed type Series
    mixed = Series(['hello', np.NaN], index=[0, 1])
    assert mixed.dtype == np.object_
    assert mixed[1] is np.NaN

    assert not self.empty.index.is_all_dates
    assert not Series({}).index.is_all_dates
    pytest.raises(Exception, Series, np.random.randn(3, 3),
                  index=np.arange(3))

    mixed.name = 'Series'
    rs = Series(mixed).name
    xp = 'Series'
    assert rs == xp

    # raise on MultiIndex GH4187
    m = MultiIndex.from_arrays([[1, 2], [3, 4]])
    pytest.raises(NotImplementedError, Series, m)
def quarter_plot(x, dates=None, ylabel=None, ax=None):
    """
    Seasonal plot of quarterly data

    Parameters
    ----------
    x : array-like
        Seasonal data to plot. If dates is None, x must be a pandas object
        with a PeriodIndex or DatetimeIndex with a quarterly frequency.
    dates : array-like, optional
        If `x` is not a pandas object, then dates must be supplied.
    ylabel : str, optional
        The label for the y-axis. Will attempt to use the `name` attribute
        of the Series.
    ax : matplotlib.axes, optional
        Existing axes instance.

    Returns
    -------
    matplotlib.Figure
    """
    from pandas import DataFrame

    if dates is None:
        from statsmodels.tools.data import _check_period_index
        _check_period_index(x, freq="Q")
    else:
        from pandas import Series, PeriodIndex
        x = Series(x, index=PeriodIndex(dates, freq="Q"))

    xticklabels = ['q1', 'q2', 'q3', 'q4']
    return seasonal_plot(x.groupby(lambda y: y.quarter), xticklabels,
                         ylabel=ylabel, ax=ax)
def test_repr_unicode(self):
    s = Series([u'\u03c3'] * 10)
    repr(s)

    a = Series([u"\u05d0"] * 1000)
    a.name = 'title1'
    repr(a)
def test_constructor_series(self):
    index1 = ['d', 'b', 'a', 'c']
    index2 = sorted(index1)
    s1 = Series([4, 7, -5, 3], index=index1)
    s2 = Series(s1, index=index2)

    assert_series_equal(s2, s1.sort_index())
def test_getitem_setitem_datetime_tz_pytz(self):
    tm._skip_if_no_pytz()
    from pytz import timezone as tz
    from pandas import date_range

    N = 50
    # testing with timezone, GH #2785
    rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern')
    ts = Series(np.random.randn(N), index=rng)

    # also test Timestamp tz handling, GH #2789
    result = ts.copy()
    result["1990-01-01 09:00:00+00:00"] = 0
    result["1990-01-01 09:00:00+00:00"] = ts[4]
    assert_series_equal(result, ts)

    result = ts.copy()
    result["1990-01-01 03:00:00-06:00"] = 0
    result["1990-01-01 03:00:00-06:00"] = ts[4]
    assert_series_equal(result, ts)

    # repeat with datetimes
    result = ts.copy()
    result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0
    result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4]
    assert_series_equal(result, ts)

    result = ts.copy()

    # comparison dates with datetime MUST be localized!
    date = tz('US/Central').localize(datetime(1990, 1, 1, 3))
    result[date] = 0
    result[date] = ts[4]
    assert_series_equal(result, ts)
def test_shift_dst(self):
    # GH 13926
    dates = date_range('2016-11-06', freq='H', periods=10, tz='US/Eastern')
    s = Series(dates)

    res = s.shift(0)
    tm.assert_series_equal(res, s)
    self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]')

    res = s.shift(1)
    exp_vals = [NaT] + dates.asobject.values.tolist()[:9]
    exp = Series(exp_vals)
    tm.assert_series_equal(res, exp)
    self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]')

    res = s.shift(-2)
    exp_vals = dates.asobject.values.tolist()[2:] + [NaT, NaT]
    exp = Series(exp_vals)
    tm.assert_series_equal(res, exp)
    self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]')

    for ex in [10, -10, 20, -20]:
        res = s.shift(ex)
        exp = Series([NaT] * 10, dtype='datetime64[ns, US/Eastern]')
        tm.assert_series_equal(res, exp)
        self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]')
def test_all_values_single_bin(self):
    # 2070
    index = period_range(start="2012-01-01", end="2012-12-31", freq="M")
    s = Series(np.random.randn(len(index)), index=index)

    result = s.resample("A", how='mean')
    tm.assert_almost_equal(result[0], s.mean())
def test_tshift(self):
    # PeriodIndex
    ps = tm.makePeriodSeries()
    shifted = ps.tshift(1)
    unshifted = shifted.tshift(-1)

    assert_series_equal(unshifted, ps)

    shifted2 = ps.tshift(freq='B')
    assert_series_equal(shifted, shifted2)

    shifted3 = ps.tshift(freq=datetools.bday)
    assert_series_equal(shifted, shifted3)

    self.assertRaises(ValueError, ps.tshift, freq='M')

    # DatetimeIndex
    shifted = self.ts.tshift(1)
    unshifted = shifted.tshift(-1)

    assert_series_equal(self.ts, unshifted)

    shifted2 = self.ts.tshift(freq=self.ts.index.freq)
    assert_series_equal(shifted, shifted2)

    inferred_ts = Series(self.ts.values, Index(np.asarray(self.ts.index)),
                         name='ts')
    shifted = inferred_ts.tshift(1)
    unshifted = shifted.tshift(-1)
    assert_series_equal(shifted, self.ts.tshift(1))
    assert_series_equal(unshifted, inferred_ts)

    no_freq = self.ts[[0, 5, 7]]
    self.assertRaises(ValueError, no_freq.tshift)
def test_upsample_with_limit(self):
    rng = date_range('1/1/2000', periods=3, freq='5t')
    ts = Series(np.random.randn(len(rng)), rng)

    result = ts.resample('t', fill_method='ffill', limit=2)
    expected = ts.reindex(result.index, method='ffill', limit=2)
    assert_series_equal(result, expected)
def test_quarterly_resampling(self):
    rng = period_range('2000Q1', periods=10, freq='Q-DEC')
    ts = Series(np.arange(10), index=rng)

    result = ts.resample('A')
    exp = ts.to_timestamp().resample('A').to_period()
    assert_series_equal(result, exp)
def test_append_concat(self):
    rng = date_range('5/8/2012 1:45', periods=10, freq='5T')
    ts = Series(np.random.randn(len(rng)), rng)
    df = DataFrame(np.random.randn(len(rng), 4), index=rng)

    result = ts.append(ts)
    result_df = df.append(df)
    ex_index = DatetimeIndex(np.tile(rng.values, 2))
    tm.assert_index_equal(result.index, ex_index)
    tm.assert_index_equal(result_df.index, ex_index)

    appended = rng.append(rng)
    tm.assert_index_equal(appended, ex_index)

    appended = rng.append([rng, rng])
    ex_index = DatetimeIndex(np.tile(rng.values, 3))
    tm.assert_index_equal(appended, ex_index)

    # different index names
    rng1 = rng.copy()
    rng2 = rng.copy()
    rng1.name = 'foo'
    rng2.name = 'bar'
    assert rng1.append(rng1).name == 'foo'
    assert rng1.append(rng2).name is None
def test_first_last_nth_dtypes(df_mixed_floats):
    df = df_mixed_floats.copy()
    df['E'] = True
    df['F'] = 1

    # tests for first / last / nth
    grouped = df.groupby('A')
    first = grouped.first()
    expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(first, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(last, expected)

    nth = grouped.nth(1)
    expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(nth, expected)

    # GH 2763, first/last shifting dtypes
    idx = lrange(10)
    idx.append(9)
    s = Series(data=lrange(11), index=idx, name='IntCol')
    assert s.dtype == 'int64'
    f = s.groupby(level=0).first()
    assert f.dtype == 'int64'
def test_nat_operations():
    # GH 8617
    s = Series([0, pd.NaT], dtype='m8[ns]')
    exp = s[0]
    assert s.median() == exp
    assert s.min() == exp
    assert s.max() == exp
def test_endswith(self):
    values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])

    result = values.str.endswith('foo')
    exp = Series([False, NA, False, False, True, NA, True])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
    rs = strings.str_endswith(mixed, 'f')
    xp = [False, NA, False, NA, NA, False, NA, NA, NA]
    tm.assert_almost_equal(rs, xp)

    rs = Series(mixed).str.endswith('f')
    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA,
                     u('foo')])

    result = values.str.endswith('foo')
    exp = Series([False, NA, False, False, True, NA, True])
    tm.assert_series_equal(result, exp)

    result = values.str.endswith('foo', na=False)
    tm.assert_series_equal(result, exp.fillna(False).astype(bool))
def test_valid_dt_with_missing_values(self):
    from datetime import date, time

    # GH 8689
    s = Series(date_range('20130101', periods=5, freq='D'))
    s.iloc[2] = pd.NaT

    for attr in ['microsecond', 'nanosecond', 'second', 'minute', 'hour',
                 'day']:
        expected = getattr(s.dt, attr).copy()
        expected.iloc[2] = np.nan
        result = getattr(s.dt, attr)
        tm.assert_series_equal(result, expected)

    result = s.dt.date
    expected = Series(
        [date(2013, 1, 1), date(2013, 1, 2), np.nan, date(2013, 1, 4),
         date(2013, 1, 5)], dtype='object')
    tm.assert_series_equal(result, expected)

    result = s.dt.time
    expected = Series([time(0), time(0), np.nan, time(0), time(0)],
                      dtype='object')
    tm.assert_series_equal(result, expected)
def shiftTs():
    dates = [datetime(2014, 1, 2), datetime(2014, 1, 3),
             datetime(2014, 1, 4), datetime(2014, 1, 5)]
    ts1 = Series(np.arange(4) + 2, index=dates)
    # ts1 = ts1 / ts1.shift(1) - 1
    print(ts1)
    ts1 = ts1.shift(1, freq='M')
    print(ts1)
def test_timedelta(self):
    converter = lambda x: pd.to_timedelta(x, unit='ms')

    s = Series([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(s.dtype, 'timedelta64[ns]')

    result = pd.read_json(s.to_json(), typ='series').apply(converter)
    assert_series_equal(result, s)

    s = Series([timedelta(23), timedelta(seconds=5)],
               index=pd.Index([0, 1]))
    self.assertEqual(s.dtype, 'timedelta64[ns]')
    result = pd.read_json(s.to_json(), typ='series').apply(converter)
    assert_series_equal(result, s)

    frame = DataFrame([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(frame[0].dtype, 'timedelta64[ns]')
    assert_frame_equal(frame, pd.read_json(frame.to_json())
                       .apply(converter))

    frame = DataFrame({'a': [timedelta(days=23), timedelta(seconds=5)],
                       'b': [1, 2],
                       'c': pd.date_range(start='20130101', periods=2)})

    result = pd.read_json(frame.to_json(date_unit='ns'))
    result['a'] = pd.to_timedelta(result.a, unit='ns')
    result['c'] = pd.to_datetime(result.c)
    assert_frame_equal(frame, result)
def test_mixed_index_assignment(self):
    # GH 19860
    s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2])
    s.at['a'] = 11
    assert s.iat[0] == 11
    s.at[1] = 22
    assert s.iat[3] == 22
def test_constructor_with_datetimelike(self):
    # 12077
    # constructor with a datetimelike and NaT
    for dtl in [date_range('1995-01-01 00:00:00', periods=5, freq='s'),
                date_range('1995-01-01 00:00:00', periods=5, freq='s',
                           tz='US/Eastern'),
                timedelta_range('1 day', periods=5, freq='s')]:

        s = Series(dtl)
        c = Categorical(s)

        expected = type(dtl)(s)
        expected.freq = None

        tm.assert_index_equal(c.categories, expected)
        tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype='int8'))

        # with NaT
        s2 = s.copy()
        s2.iloc[-1] = NaT
        c = Categorical(s2)

        expected = type(dtl)(s2.dropna())
        expected.freq = None

        tm.assert_index_equal(c.categories, expected)

        exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(c.codes, exp)

        result = repr(c)
        assert 'NaT' in result
def test_annual_upsample(self):
    targets = ['D', 'B', 'M']

    for month in MONTHS:
        ts = _simple_pts('1/1/1990', '12/31/1995', freq='A-%s' % month)

        for targ, conv, meth in product(targets, ['start', 'end'],
                                        ['ffill', 'bfill']):
            result = ts.resample(targ, fill_method=meth, convention=conv)
            expected = result.to_timestamp(targ, how=conv)
            expected = expected.asfreq(targ, meth).to_period()
            assert_series_equal(result, expected)

    df = DataFrame({'a': ts})
    rdf = df.resample('D', fill_method='ffill')
    exp = df['a'].resample('D', fill_method='ffill')
    assert_series_equal(rdf['a'], exp)

    rng = period_range('2000', '2003', freq='A-DEC')
    ts = Series([1, 2, 3, 4], index=rng)

    result = ts.resample('M', fill_method='ffill')
    ex_index = period_range('2000-01', '2003-12', freq='M')

    expected = ts.asfreq('M', how='start').reindex(ex_index,
                                                   method='ffill')
    assert_series_equal(result, expected)
def test_astype(self):
    s = Series(np.random.randn(5), name='foo')

    for dtype in ['float32', 'float64', 'int64', 'int32']:
        astyped = s.astype(dtype)
        self.assertEqual(astyped.dtype, dtype)
        self.assertEqual(astyped.name, s.name)
def test_axis_alias(self):
    s = Series([1, 2, np.nan])
    tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index"))
    assert s.dropna().sum("rows") == 3
    assert s._get_axis_number("rows") == 0
    assert s._get_axis_name("rows") == "index"
def test_cat_accessor_updates_on_inplace(self):
    s = Series(list("abc")).astype("category")
    s.drop(0, inplace=True)
    s.cat.remove_unused_categories(inplace=True)
    assert len(s.cat.categories) == 2
def test_cat_accessor_no_new_attributes(self):
    # https://github.com/pandas-dev/pandas/issues/10673
    c = Series(list("aabbcde")).astype("category")
    with pytest.raises(AttributeError,
                       match="You cannot add any new attribute"):
        c.cat.xlabel = "a"
def test_getname_categorical_accessor(self, method):
    # GH 17509
    s = Series([1, 2, 3], name="A").astype("category")
    expected = "A"
    result = method(s).name
    assert result == expected
def test_get_values_deprecation(self):
    s = Series(range(9))
    with tm.assert_produces_warning(FutureWarning):
        res = s.get_values()
    tm.assert_numpy_array_equal(res, s.values)
def test_integer_series_size(self):
    # GH 25580
    s = Series(range(9))
    assert s.size == 9

    s = Series(range(9), dtype="Int64")
    assert s.size == 9
def test_ndarray_compat(self):
    # test numpy compat with Series as sub-class of NDFrame
    tsdf = DataFrame(
        np.random.randn(1000, 3),
        columns=["A", "B", "C"],
        index=date_range("1/1/2000", periods=1000),
    )

    def f(x):
        return x[x.idxmax()]

    result = tsdf.apply(f)
    expected = tsdf.max()
    tm.assert_series_equal(result, expected)

    # .item()
    with tm.assert_produces_warning(FutureWarning):
        s = Series([1])
        result = s.item()
        assert result == 1
        assert s.item() == s.iloc[0]

    # using an ndarray like function
    s = Series(np.random.randn(10))
    result = Series(np.ones_like(s))
    expected = Series(1, index=range(10), dtype="float64")
    tm.assert_series_equal(result, expected)

    # ravel
    s = Series(np.random.randn(10))
    tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F"))

    # compress
    # GH 6658
    s = Series([0, 1.0, -1], index=list("abc"))
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = np.compress(s > 0, s)
    tm.assert_series_equal(result, Series([1.0], index=["b"]))

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = np.compress(s < -1, s)
    # result empty Index(dtype=object) as the same as original
    exp = Series([], dtype="float64", index=Index([], dtype="object"))
    tm.assert_series_equal(result, exp)

    s = Series([0, 1.0, -1], index=[0.1, 0.2, 0.3])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = np.compress(s > 0, s)
    tm.assert_series_equal(result, Series([1.0], index=[0.2]))

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = np.compress(s < -1, s)
    # result empty Float64Index as the same as original
    exp = Series([], dtype="float64", index=Index([], dtype="float64"))
    tm.assert_series_equal(result, exp)
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
print(frame2.ix['three'])  # one row of values
print(frame2)
print(frame2['state'])  # fetch one column; the result is a Series
print(frame2.ix['one'])
print(frame2.year)
print(frame2['pop'].ix['three'])

frame2['debt'] = 16.5  # overwrite an entire column
print(frame2)
frame2.debt = np.arange(5)  # overwrite the column with a numpy array
print(frame2)
print("-" * 40)

# Use a Series to set values at specific index labels; rows not named
# in the Series default to NaN.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print("-" * 40)

# # Assign to a new column
# frame2['eastern'] = (frame2.state == 'beijing')  # True where state equals 'beijing'
# print(frame2)
# print(frame2.columns)
# print("-" * 40)

# # DataFrame transpose
pop = {
    'shangsha': {
        2001: 2.4,
        2002: 2.9
def test_comparisons(self, data, reverse, base):
    cat_rev = Series(Categorical(data, categories=reverse, ordered=True))
    cat_rev_base = Series(
        Categorical(base, categories=reverse, ordered=True))
    cat = Series(Categorical(data, ordered=True))
    cat_base = Series(
        Categorical(base, categories=cat.cat.categories, ordered=True))
    s = Series(base)
    a = np.array(base)

    # comparisons need to take categories ordering into account
    res_rev = cat_rev > cat_rev_base
    exp_rev = Series([True, False, False])
    tm.assert_series_equal(res_rev, exp_rev)

    res_rev = cat_rev < cat_rev_base
    exp_rev = Series([False, False, True])
    tm.assert_series_equal(res_rev, exp_rev)

    res = cat > cat_base
    exp = Series([False, False, True])
    tm.assert_series_equal(res, exp)

    scalar = base[1]
    res = cat > scalar
    exp = Series([False, False, True])
    exp2 = cat.values > scalar
    tm.assert_series_equal(res, exp)
    tm.assert_numpy_array_equal(res.values, exp2)
    res_rev = cat_rev > scalar
    exp_rev = Series([True, False, False])
    exp_rev2 = cat_rev.values > scalar
    tm.assert_series_equal(res_rev, exp_rev)
    tm.assert_numpy_array_equal(res_rev.values, exp_rev2)

    # Only categories with same categories can be compared
    with pytest.raises(TypeError):
        cat > cat_rev

    # categorical cannot be compared to Series or numpy array, and also
    # not the other way around
    msg = ("Cannot compare a Categorical for op __gt__ with type"
           r" <class 'numpy\.ndarray'>")
    with pytest.raises(TypeError, match=msg):
        cat > s
    with pytest.raises(TypeError, match=msg):
        cat_rev > s
    with pytest.raises(TypeError, match=msg):
        cat > a
    with pytest.raises(TypeError, match=msg):
        cat_rev > a

    with pytest.raises(TypeError, match=msg):
        s < cat
    with pytest.raises(TypeError, match=msg):
        s < cat_rev

    with pytest.raises(TypeError, match=msg):
        a < cat
    with pytest.raises(TypeError, match=msg):
        a < cat_rev
def test_raise_on_info(self):
    s = Series(np.random.randn(10))
    msg = "'Series' object has no attribute 'info'"
    with pytest.raises(AttributeError, match=msg):
        s.info()
class TestCategoricalOps:

    def test_compare_frame(self):
        # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
        data = ["a", "b", 2, "a"]
        cat = Categorical(data)
        df = DataFrame(cat)

        for op in [operator.eq, operator.ne, operator.ge,
                   operator.gt, operator.le, operator.lt]:
            with pytest.raises(ValueError):
                # alignment raises unless we transpose
                op(cat, df)

        result = cat == df.T
        expected = DataFrame([[True, True, True, True]])
        tm.assert_frame_equal(result, expected)

        result = cat[::-1] != df.T
        expected = DataFrame([[False, True, True, False]])
        tm.assert_frame_equal(result, expected)

    def test_datetime_categorical_comparison(self):
        dt_cat = Categorical(date_range('2014-01-01', periods=3),
                             ordered=True)
        tm.assert_numpy_array_equal(dt_cat > dt_cat[0],
                                    np.array([False, True, True]))
        tm.assert_numpy_array_equal(dt_cat[0] < dt_cat,
                                    np.array([False, True, True]))

    def test_reflected_comparison_with_scalars(self):
        # GH8658
        cat = Categorical([1, 2, 3], ordered=True)
        tm.assert_numpy_array_equal(cat > cat[0],
                                    np.array([False, True, True]))
        tm.assert_numpy_array_equal(cat[0] < cat,
                                    np.array([False, True, True]))

    def test_comparison_with_unknown_scalars(self):
        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
        # and following comparisons with scalars not in categories should
        # raise for unequal comps, but not for equal/not equal
        cat = Categorical([1, 2, 3], ordered=True)

        msg = ("Cannot compare a Categorical for op __{}__ with a scalar,"
               " which is not a category")
        with pytest.raises(TypeError, match=msg.format('lt')):
            cat < 4
        with pytest.raises(TypeError, match=msg.format('gt')):
            cat > 4
        with pytest.raises(TypeError, match=msg.format('gt')):
            4 < cat
        with pytest.raises(TypeError, match=msg.format('lt')):
            4 > cat

        tm.assert_numpy_array_equal(cat == 4,
                                    np.array([False, False, False]))
        tm.assert_numpy_array_equal(cat != 4,
                                    np.array([True, True, True]))

    def test_comparison_of_ordered_categorical_with_nan_to_scalar(
            self, compare_operators_no_eq_ne):
        # https://github.com/pandas-dev/pandas/issues/26504
        # BUG: fix ordered categorical comparison with missing values (#26504)
        # and following comparisons with scalars in categories with missing
        # values should be evaluated as False
        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3],
                          ordered=True)
        scalar = 2
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            expected = getattr(np.array(cat),
                               compare_operators_no_eq_ne)(scalar)
        actual = getattr(cat, compare_operators_no_eq_ne)(scalar)
        tm.assert_numpy_array_equal(actual, expected)

    def test_comparison_of_ordered_categorical_with_nan_to_listlike(
            self, compare_operators_no_eq_ne):
        # https://github.com/pandas-dev/pandas/issues/26504
        # and following comparisons of missing values in ordered Categorical
        # with listlike should be evaluated as False
        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3],
                          ordered=True)
        other = Categorical([2, 2, 2, 2], categories=[1, 2, 3],
                            ordered=True)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            expected = getattr(np.array(cat),
                               compare_operators_no_eq_ne)(2)
        actual = getattr(cat, compare_operators_no_eq_ne)(other)
        tm.assert_numpy_array_equal(actual, expected)

    @pytest.mark.parametrize('data,reverse,base', [
        (list("abc"), list("cba"), list("bbb")),
        ([1, 2, 3], [3, 2, 1], [2, 2, 2])]
    )
    def test_comparisons(self, data, reverse, base):
        cat_rev = Series(
            Categorical(data, categories=reverse, ordered=True))
        cat_rev_base = Series(
            Categorical(base, categories=reverse, ordered=True))
        cat = Series(Categorical(data, ordered=True))
        cat_base = Series(
            Categorical(base, categories=cat.cat.categories, ordered=True))
        s = Series(base)
        a = np.array(base)

        # comparisons need to take categories ordering into account
        res_rev = cat_rev > cat_rev_base
        exp_rev = Series([True, False, False])
        tm.assert_series_equal(res_rev, exp_rev)

        res_rev = cat_rev < cat_rev_base
        exp_rev = Series([False, False, True])
        tm.assert_series_equal(res_rev, exp_rev)

        res = cat > cat_base
        exp = Series([False, False, True])
        tm.assert_series_equal(res, exp)

        scalar = base[1]
        res = cat > scalar
        exp = Series([False, False, True])
        exp2 = cat.values > scalar
        tm.assert_series_equal(res, exp)
        tm.assert_numpy_array_equal(res.values, exp2)
        res_rev = cat_rev > scalar
        exp_rev = Series([True, False, False])
        exp_rev2 = cat_rev.values > scalar
        tm.assert_series_equal(res_rev, exp_rev)
        tm.assert_numpy_array_equal(res_rev.values, exp_rev2)

        # Only categories with same categories can be compared
        with pytest.raises(TypeError):
            cat > cat_rev

        # categorical cannot be compared to Series or numpy array, and also
        # not the other way around
        msg = ("Cannot compare a Categorical for op __gt__ with type"
               r" <class 'numpy\.ndarray'>")
        with pytest.raises(TypeError, match=msg):
            cat > s
        with pytest.raises(TypeError, match=msg):
            cat_rev > s
        with pytest.raises(TypeError, match=msg):
            cat > a
        with pytest.raises(TypeError, match=msg):
            cat_rev > a

        with pytest.raises(TypeError, match=msg):
            s < cat
        with pytest.raises(TypeError, match=msg):
            s < cat_rev

        with pytest.raises(TypeError, match=msg):
            a < cat
        with pytest.raises(TypeError, match=msg):
            a < cat_rev

    @pytest.mark.parametrize('ctor', [
        lambda *args, **kwargs: Categorical(*args, **kwargs),
        lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
    ])
    def test_unordered_different_order_equal(self, ctor):
        # https://github.com/pandas-dev/pandas/issues/16014
        c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
        c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
        assert (c1 == c2).all()

        c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
        c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False)
        assert (c1 != c2).all()

        c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
        c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False)
        assert (c1 != c2).all()

        c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
        c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
        result = c1 == c2
        tm.assert_numpy_array_equal(np.array(result),
                                    np.array([True, False]))

    def test_unordered_different_categories_raises(self):
        c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
        c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False)

        with pytest.raises(TypeError, match=("Categoricals can "
                                             "only be compared")):
            c1 == c2

    def test_compare_different_lengths(self):
        c1 = Categorical([], categories=['a', 'b'])
        c2 = Categorical([], categories=['a'])

        msg = "Categories are different lengths"
        with pytest.raises(TypeError, match=msg):
            c1 == c2

    def test_compare_unordered_different_order(self):
        # https://github.com/pandas-dev/pandas/issues/16603#issuecomment-
        # 349290078
        a = pd.Categorical(['a'], categories=['a', 'b'])
        b = pd.Categorical(['b'], categories=['b', 'a'])
        assert not a.equals(b)

    def test_numeric_like_ops(self):
        df = DataFrame({'value': np.random.randint(0, 10000, 100)})
        labels = ["{0} - {1}".format(i, i + 499)
                  for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                                   right=False, labels=cat_labels)

        # numeric ops should not succeed
        for op, str_rep in [('__add__', r'\+'),
                            ('__sub__', '-'),
                            ('__mul__', r'\*'),
                            ('__truediv__', '/')]:
            msg = r"Series cannot perform the operation {}".format(str_rep)
            with pytest.raises(TypeError, match=msg):
                getattr(df, op)(df)

        # reduction ops should not succeed (unless specifically defined,
        # e.g. min/max)
        s = df['value_group']
        for op in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']:
            msg = "Categorical cannot perform the operation {}".format(op)
            with pytest.raises(TypeError, match=msg):
                getattr(s, op)(numeric_only=False)

        # mad technically works because it takes always the numeric data

        # numpy ops
        s = Series(Categorical([1, 2, 3, 4]))
        with pytest.raises(TypeError):
            np.sum(s)

        # numeric ops on a Series
        for op, str_rep in [('__add__', r'\+'),
                            ('__sub__', '-'),
                            ('__mul__', r'\*'),
                            ('__truediv__', '/')]:
            msg = r"Series cannot perform the operation {}".format(str_rep)
            with pytest.raises(TypeError, match=msg):
                getattr(s, op)(2)

        # invalid ufunc
        with pytest.raises(TypeError):
            np.log(s)

    def test_contains(self):
        # GH21508
        c = pd.Categorical(list('aabbca'), categories=list('cab'))

        assert 'b' in c
        assert 'z' not in c
        assert np.nan not in c
        with pytest.raises(TypeError):
            assert [1] in c

        # assert codes NOT in index
        assert 0 not in c
        assert 1 not in c

        c = pd.Categorical(list('aabbca') + [np.nan],
                           categories=list('cab'))
        assert np.nan in c
def difference(dataset, interval=1):
    # subtract the observation `interval` steps back from each observation,
    # returning the differenced values as a Series
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return Series(diff)
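A minimal usage sketch (assumed, not part of the original source): first-order differencing removes the level of a series, and the operation can be inverted with a cumulative sum plus the first observation.

# Hypothetical usage of difference(); the data values are illustrative.
from pandas import Series

data = Series([10, 12, 15, 19, 24])
diffed = difference(data, interval=1)    # Series([2, 3, 4, 5])
restored = diffed.cumsum() + data[0]     # re-adds the level removed above
print(diffed.tolist())                   # [2, 3, 4, 5]
print(restored.tolist())                 # [12, 15, 19, 24]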
def test_clip_with_na_args(self):
    """Should process np.nan argument as None"""
    # GH#17276
    s = Series([1, 2, 3])

    tm.assert_series_equal(s.clip(np.nan), Series([1, 2, 3]))
    tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan),
                           Series([1, 2, 3]))

    # GH#19992
    tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]),
                           Series([1, 4, 3]))
    tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]),
                           Series([1, 2, 1]))

    # GH#40420
    s = Series([1, 2, 3])
    result = s.clip(0, [np.nan, np.nan, np.nan])
    tm.assert_series_equal(s, result)
def test_comparisons(self):
    result = self.factor[self.factor == 'a']
    expected = self.factor[np.asarray(self.factor) == 'a']
    tm.assert_categorical_equal(result, expected)

    result = self.factor[self.factor != 'a']
    expected = self.factor[np.asarray(self.factor) != 'a']
    tm.assert_categorical_equal(result, expected)

    result = self.factor[self.factor < 'c']
    expected = self.factor[np.asarray(self.factor) < 'c']
    tm.assert_categorical_equal(result, expected)

    result = self.factor[self.factor > 'a']
    expected = self.factor[np.asarray(self.factor) > 'a']
    tm.assert_categorical_equal(result, expected)

    result = self.factor[self.factor >= 'b']
    expected = self.factor[np.asarray(self.factor) >= 'b']
    tm.assert_categorical_equal(result, expected)

    result = self.factor[self.factor <= 'b']
    expected = self.factor[np.asarray(self.factor) <= 'b']
    tm.assert_categorical_equal(result, expected)

    n = len(self.factor)

    other = self.factor[np.random.permutation(n)]
    result = self.factor == other
    expected = np.asarray(self.factor) == np.asarray(other)
    tm.assert_numpy_array_equal(result, expected)

    result = self.factor == 'd'
    expected = np.repeat(False, len(self.factor))
    tm.assert_numpy_array_equal(result, expected)

    # comparisons with categoricals
    cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"],
                          ordered=True)
    cat_rev_base = Categorical(["b", "b", "b"],
                               categories=["c", "b", "a"], ordered=True)
    cat = Categorical(["a", "b", "c"], ordered=True)
    cat_base = Categorical(["b", "b", "b"], categories=cat.categories,
                           ordered=True)

    # comparisons need to take categories ordering into account
    res_rev = cat_rev > cat_rev_base
    exp_rev = np.array([True, False, False])
    tm.assert_numpy_array_equal(res_rev, exp_rev)

    res_rev = cat_rev < cat_rev_base
    exp_rev = np.array([False, False, True])
    tm.assert_numpy_array_equal(res_rev, exp_rev)

    res = cat > cat_base
    exp = np.array([False, False, True])
    tm.assert_numpy_array_equal(res, exp)

    # Only categories with same categories can be compared
    with pytest.raises(TypeError):
        cat > cat_rev

    cat_rev_base2 = Categorical(["b", "b", "b"],
                                categories=["c", "b", "a", "d"])
    with pytest.raises(TypeError):
        cat_rev > cat_rev_base2

    # Only categories with same ordering information can be compared
    cat_unordered = cat.set_ordered(False)
    assert not (cat > cat).any()
    with pytest.raises(TypeError):
        cat > cat_unordered

    # comparison (in both directions) with Series will raise
    s = Series(["b", "b", "b"])
    msg = ("Cannot compare a Categorical for op __gt__ with type"
           r" <class 'numpy\.ndarray'>")
    with pytest.raises(TypeError, match=msg):
        cat > s
    with pytest.raises(TypeError, match=msg):
        cat_rev > s
    with pytest.raises(TypeError, match=msg):
        s < cat
    with pytest.raises(TypeError, match=msg):
        s < cat_rev

    # comparison with numpy.array will raise in both directions, but only
    # on newer numpy versions
    a = np.array(["b", "b", "b"])
    with pytest.raises(TypeError, match=msg):
        cat > a
    with pytest.raises(TypeError, match=msg):
        cat_rev > a

    # Make sure that unequal comparisons take the categories order into
    # account
    cat_rev = Categorical(list("abc"), categories=list("cba"),
                          ordered=True)
    exp = np.array([True, False, False])
    res = cat_rev > "b"
    tm.assert_numpy_array_equal(res, exp)

    # check that zero-dim array gets unboxed
    res = cat_rev > np.array("b")
    tm.assert_numpy_array_equal(res, exp)
def _unary_op(op, this, null_value=False):
    # type: (str, GeoSeries, Any) -> Series
    """Unary operation that returns a Series"""
    return Series([getattr(geom, op, null_value) for geom in this.geometry],
                  index=this.index, dtype=np.dtype(type(null_value)))
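A hedged sketch of how a helper like this is typically consumed; the `is_valid` property below is hypothetical (not from the original module) and assumes it lives in a GeoSeries-like class exposing `.geometry` and `.index`.

# Hypothetical delegating property built on _unary_op(): it names the
# per-geometry attribute to collect and gets back a bool Series.
@property
def is_valid(self):
    """Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
    geometries that are valid."""
    return _unary_op('is_valid', self, null_value=False)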
def test_grouper_creation_bug(self):
    # GH 8795
    df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
    g = df.groupby('A')
    expected = g.sum()

    g = df.groupby(pd.Grouper(key='A'))
    result = g.sum()
    assert_frame_equal(result, expected)

    result = g.apply(lambda x: x.sum())
    assert_frame_equal(result, expected)

    g = df.groupby(pd.Grouper(key='A', axis=0))
    result = g.sum()
    assert_frame_equal(result, expected)

    # GH14334
    # pd.Grouper(key=...) may be passed in a list
    df = DataFrame({'A': [0, 0, 0, 1, 1, 1],
                    'B': [1, 1, 2, 2, 3, 3],
                    'C': [1, 2, 3, 4, 5, 6]})
    # Group by single column
    expected = df.groupby('A').sum()
    g = df.groupby([pd.Grouper(key='A')])
    result = g.sum()
    assert_frame_equal(result, expected)

    # Group by two columns
    # using a combination of strings and Grouper objects
    expected = df.groupby(['A', 'B']).sum()

    # Group with two Grouper objects
    g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')])
    result = g.sum()
    assert_frame_equal(result, expected)

    # Group with a string and a Grouper object
    g = df.groupby(['A', pd.Grouper(key='B')])
    result = g.sum()
    assert_frame_equal(result, expected)

    # Group with a Grouper object and a string
    g = df.groupby([pd.Grouper(key='A'), 'B'])
    result = g.sum()
    assert_frame_equal(result, expected)

    # GH8866
    s = Series(np.arange(8, dtype='int64'),
               index=pd.MultiIndex.from_product(
                   [list('ab'), range(2),
                    date_range('20130101', periods=2)],
                   names=['one', 'two', 'three']))
    result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
    expected = Series([28], index=Index([Timestamp('2013-01-31')],
                                        freq='M', name='three'))
    assert_series_equal(result, expected)

    # just specifying a level breaks
    result = s.groupby(pd.Grouper(level='one')).sum()
    expected = s.groupby(level='one').sum()
    assert_series_equal(result, expected)
def detect(data, args):
    in_file = data['r2_path']
    out_prefix = data['sample_id']
    out_file = out_prefix + "_polyA.dat.gz"
    out_name_false = out_prefix + "_none.dat.gz"
    counts = Counter()
    num_line = 0
    logger.my_logger.info("reading file %s" % in_file)
    logger.my_logger.info("creating files %s %s" % (out_file,
                                                    out_name_false))
    data['detect'] = out_file
    if os.path.exists(out_file):
        return data
    with file_transaction(out_file) as tx_out_file:
        with open_fastq(in_file) as handle, \
                gzip.open(tx_out_file, 'w') as out, \
                gzip.open(out_name_false, 'w') as out_false:
            for line in handle:
                # print line
                num_line += 1
                if num_line % 1000000 == 0:
                    logger.my_logger.info("read %s lines:" % num_line)
                if line.startswith("@HISEQ"):
                    # print line
                    name = line.strip()
                    seq = handle.next().strip()
                    handle.next().strip()
                    qual = handle.next().strip()
                    find = _adapter(seq, qual)
                    # print "%s %s" % (seq, find)
                    if find:
                        seq, qual = find
                        ns = poly_A_percentage(seq)
                        # ns = polyA(seq)
                        if ns:
                            if ns[1] - ns[0] >= 6:
                                # print "positions are" + str(ns[0]) + ".." + str(ns[1])
                                mod = seq[:ns[0]]
                                seq_polyA = seq[ns[0]:ns[1]]
                                seq_gene = seq[ns[1]:]
                                qual_polyA = qual[ns[0]:ns[1]]
                                qual_gene = qual[ns[1]:]
                                out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                                          (name, ns[0], ns[1], mod, seq_polyA,
                                           qual_polyA, seq_gene, qual_gene))
                                counts['polyA'] += 1
                                if len(mod) > 0:
                                    counts['mod'] += 1
                            else:
                                counts['shortA'] += 1
                                out_false.write("%s\t%s\t%s\t%s\n" %
                                                ("shortA", name, seq, qual))
                        else:
                            counts['noA'] += 1
                            out_false.write("%s\t%s\t%s\t%s\n" %
                                            ("None", name, seq, qual))
                    else:
                        out_false.write("%s\t%s\t%s\t%s\n" %
                                        ("No_tag", name, seq, qual))
                        counts['notag'] += 1
    with file_transaction(out_prefix + ".stat") as tx_stat_file:
        df = Series(counts)
        df.to_csv(tx_stat_file, sep="\t")
    logger.my_logger.info("%s" % counts)
    return data
def is_ring(self):
    """Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
    features that are closed."""
    # operates on the exterior, so can't use _unary_op()
    return Series([geom.exterior.is_ring for geom in self.geometry],
                  index=self.index)
# def clus_name(clus_num):
#     clus_map = {0: "a", 1: "b", 2: "c", 3: "d"}
#     return clus_map[clus_num]
#
# df_clus['clus_profile'] = df_clus['cluster'].apply(lambda row: clus_name(row))

with open('us_counties.topo.json') as json_data:
    d = json.load(json_data)

county_list = []
for i in range(0, len(d['objects']['us_counties.geo']['geometries'])):
    county_list.append(
        d['objects']['us_counties.geo']['geometries'][i]['properties']['FIPS'])

county_list = Series(county_list)
county_list = DataFrame(county_list, columns=["FIPS"])

df_counties = pd.merge(county_list, df_clus, left_on="FIPS",
                       right_on="FIPS", how='left')
df_counties['cluster'] = df_counties['cluster'].fillna(-1)
df_counties['cluster'] = df_counties['cluster'] + 1

# a raw string is required here: sequences like \U in a plain string
# literal are invalid escapes in Python 3
os.chdir(r'C:\Users\Vaishnavi\Documents\Notebooks')
df_output = df_counties[['FIPS', 'cluster']]
df_output = df_output.sort_values(by='FIPS')
df_output.to_csv('Cluster_counties.csv', sep=',')
def test_set_index_cast_datetimeindex(self):
    df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                          for i in range(1000)],
                    'B': np.random.randn(1000)})

    idf = df.set_index('A')
    assert isinstance(idf.index, pd.DatetimeIndex)

    # don't cast a DatetimeIndex WITH a tz, leave as object
    # GH 6032
    i = (pd.DatetimeIndex(
        to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'], errors="raise"))
        .tz_localize('US/Pacific'))
    df = DataFrame(np.random.randn(2, 1), columns=['A'])

    expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                             tz='US/Pacific'),
                                pd.Timestamp('2013-01-02 14:00:00-0800',
                                             tz='US/Pacific')],
                               dtype="object"))

    # convert index to series
    result = Series(i)
    assert_series_equal(result, expected)

    # assign to frame
    df['B'] = i
    result = df['B']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'B'

    # keep the timezone
    result = i.to_series(keep_tz=True)
    assert_series_equal(result.reset_index(drop=True), expected)

    # convert to utc
    df['C'] = i.to_series().reset_index(drop=True)
    result = df['C']
    comp = pd.DatetimeIndex(expected.values).copy()
    comp.tz = None
    tm.assert_numpy_array_equal(result.values, comp.values)

    # list of datetimes with a tz
    df['D'] = i.to_pydatetime()
    result = df['D']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'D'

    # GH 6785
    # set the index manually
    import pytz
    df = DataFrame([{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc),
                     'foo': 1}])
    expected = df.set_index('ts')
    df.index = df['ts']
    df.pop('ts')
    assert_frame_equal(df, expected)

    # GH 3950
    # reset_index with single level
    for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
        idx = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz,
                            name='idx')
        df = pd.DataFrame({'a': range(5),
                           'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)

        expected = pd.DataFrame({'idx': [datetime(2011, 1, 1),
                                         datetime(2011, 1, 2),
                                         datetime(2011, 1, 3),
                                         datetime(2011, 1, 4),
                                         datetime(2011, 1, 5)],
                                 'a': range(5),
                                 'b': ['A', 'B', 'C', 'D', 'E']},
                                columns=['idx', 'a', 'b'])
        expected['idx'] = expected['idx'].apply(
            lambda d: pd.Timestamp(d, tz=tz))
        assert_frame_equal(df.reset_index(), expected)
    ns = len(y)
    weights = np.ones(ns) * mu
    for k in range(ns):
        weights[k] = weights[k] ** k
    weights = np.flip(weights, 0)

    # fit the model to the dataset (the LinearRegression import below is
    # unused; the original comment referred to SVR)
    from sklearn.linear_model import LinearRegression
    lr = AR()
    # lr.fit(x, y, sample_weight=weights)
    lr.fit(y)
    y_pred = lr.predict(pred)
    return y_pred


series = Series.from_csv('daily-minimum-temperatures.csv', header=0)
# split dataset
X = series.values
train, test = X[1:len(X) - 7], X[len(X) - 7:]
# train autoregression
model = AR(train)
model_fit = model.fit()
print('Lag: %s' % model_fit.k_ar)
print('Coefficients: %s' % model_fit.params)
# make predictions
predictions = model_fit.predict(start=len(train),
                                end=len(train) + len(test) - 1,
                                dynamic=False)
for i in range(len(predictions)):
    print('predicted=%f, expected=%f' % (predictions[i], test[i]))
error = mean_squared_error(test, predictions)
           WHERE date = %(date)s
        '''

# ----- Run query
games = read_sql(query, db, params={'date': today})

# ----- Filter to games that haven't started
games = games[[item.hour > dt.datetime.now().hour
               for item in games['game_time']]]

# ----- Only continue if games exist
if games.shape[0] != 0:
    lines = df_ml.merge(games, on='id')

    # ----- Find the best book to place a home and an away bet at:
    away_books = [item for item in lines.columns if 'odds_a' in item]
    away_best, away_sports_book, away_odds = list(), list(), list()
    for game in lines[away_books].iterrows():
        values = add_lines(Series(game[1]))
        away_best.append(max(values))
        away_sports_book.append(
            away_books[values.index(max(values))].replace('odds_a', ''))
        away_odds.append(game[1].values[values.index(max(values))])

    home_books = [item for item in lines.columns if 'odds_h' in item]
    home_best, home_sports_book, home_odds = list(), list(), list()
    for game in lines[home_books].iterrows():
        values = add_lines(Series(game[1]))
        home_best.append(max(values))
        home_sports_book.append(
            home_books[values.index(max(values))].replace('odds_h', ''))
        home_odds.append(game[1].values[values.index(max(values))])

    final_lines = lines[['id', 'away_chance_winning', 'home_chance_winning'
for i in range(3, 10):
    print(i)

# iterate over a string
for letter in 'python':
    print('current letter:', letter)

# iterate over a list
fruits = ['banana', 'apple', 'mango']
for fruit in fruits:
    print('now is:', fruit)

# iterate over a Series
x = Series(['a', True, 1], index=['first', 'second', 'third'])
x[0]
x['second']
x[2]

for i in x.values:
    print(i)

for v in x:
    print("value in x:", v)

for index in x.index:
    print("index in x:", index)
    print("value in x:", x[index])
    print("_____________")
def lagou_spider_keyword(keyword):
    # encode the search string as utf-8, then build the lagou.com search url
    keywordbyte = keyword.encode('utf-8')
    keywordindex = str(keywordbyte).replace(r'\x', '%').replace(r"'", "")
    keywordindex = re.sub('^b', '', keywordindex)

    # work out how many result pages there are in total
    i = 0
    type = 'true'
    url = ('http://www.lagou.com/jobs/positionAjax.json?px=default&first=' +
           type + '&kd=' + keywordindex + '&pn=' + str(i + 1))
    with request.urlopen(url) as f:
        data = f.read()
    urlcount = int(json.loads(str(data, encoding='utf-8',
                                  errors='ignore'))["content"]["totalPageCount"])
    print('total result pages for this search: %d' % urlcount)

    # start the actual crawl
    for i in list(range(0, urlcount)):
        # build the page url
        if i == 0:
            type = 'true'
        else:
            type = 'false'
        url = ('http://www.lagou.com/jobs/positionAjax.json?px=default&first=' +
               type + '&kd=' + keywordindex + '&pn=' + str(i + 1))
        with request.urlopen(url) as f:
            data = f.read()
        # read the json payload and start parsing
        try:
            jsondata = json.loads(str(data, encoding='utf-8',
                                      errors='ignore'))["content"]['result']
            for t in list(range(len(jsondata))):
                # join the company description list into one string
                jsondata[t]['companyLabelList2'] = '-'.join(
                    jsondata[t]['companyLabelList'])
                jsondata[t].pop('companyLabelList')
                # turn each row into a Series, then concatenate
                if t == 0:
                    rdata = DataFrame(Series(data=jsondata[t])).T
                else:
                    rdata = pd.concat([rdata,
                                       DataFrame(Series(data=jsondata[t])).T])
            # re-index rdata
            rdata.index = range(1, len(rdata) + 1)
            rdata['keyword'] = keyword
            rdata['salarymin'] = 0
            rdata['salarymax'] = 0
            rdata['url'] = ''
            rdata['jd'] = ''  # job description
            rdata['handle_perc'] = ''  # share of resumes handled within seven days
            rdata['handle_day'] = ''   # average days to finish handling a resume
            for klen in list(range(len(rdata['salary']))):
                rdata.ix[klen + 1, 'salarymin'] = re.search(
                    '^(\d*?)k', rdata['salary'].iloc[klen]).group(1)
                # if no upper salary is given (e.g. "8k and above"),
                # leave the field empty
                if re.search('-(\d*?)k$',
                             rdata['salary'].iloc[klen]) is not None:
                    rdata.ix[klen + 1, 'salarymax'] = re.search(
                        '-(\d*?)k$', rdata['salary'].iloc[klen]).group(1)
                else:
                    rdata.ix[klen + 1, 'salarymax'] = ''
                # add a url column for the follow-up jd crawl
                rdata.ix[klen + 1, 'url'] = ('http://www.lagou.com/jobs/%s.html'
                                             % rdata.ix[klen + 1, 'positionId'])
                # fetch each url to pull in the jd text
                with request.urlopen(rdata.ix[klen + 1, 'url']) as f:
                    data_url = f.read()
                soup_url = BeautifulSoup(data_url, 'html5lib')
                strings_url = soup_url.find('dd', class_='job_bt').strings
                rdata.ix[klen + 1, 'jd'] = ''.join(strings_url).encode(
                    'gbk', 'ignore').decode('gbk', 'ignore').replace(' ', '')
                temp = soup_url.find_all('span', class_='data')
                if re.search('>(\w*%)<', str(temp[0])) == None:
def hilo(high, low, close, high_length=None, low_length=None, mamode=None,
         offset=None, **kwargs):
    """Indicator: Gann HiLo (HiLo)"""
    # Validate Arguments
    high_length = int(high_length) if high_length and high_length > 0 else 13
    low_length = int(low_length) if low_length and low_length > 0 else 21
    mamode = mamode.lower() if isinstance(mamode, str) else "sma"
    _length = max(high_length, low_length)
    high = verify_series(high, _length)
    low = verify_series(low, _length)
    close = verify_series(close, _length)
    offset = get_offset(offset)

    if high is None or low is None or close is None:
        return

    # Calculate Result
    m = close.size
    hilo = Series(npNaN, index=close.index)
    long = Series(npNaN, index=close.index)
    short = Series(npNaN, index=close.index)

    high_ma = ma(mamode, high, length=high_length)
    low_ma = ma(mamode, low, length=low_length)

    for i in range(1, m):
        if close.iloc[i] > high_ma.iloc[i - 1]:
            hilo.iloc[i] = long.iloc[i] = low_ma.iloc[i]
        elif close.iloc[i] < low_ma.iloc[i - 1]:
            hilo.iloc[i] = short.iloc[i] = high_ma.iloc[i]
        else:
            hilo.iloc[i] = hilo.iloc[i - 1]
            long.iloc[i] = short.iloc[i] = hilo.iloc[i - 1]

    # Offset
    if offset != 0:
        hilo = hilo.shift(offset)
        long = long.shift(offset)
        short = short.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        hilo.fillna(kwargs["fillna"], inplace=True)
        long.fillna(kwargs["fillna"], inplace=True)
        short.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        hilo.fillna(method=kwargs["fill_method"], inplace=True)
        long.fillna(method=kwargs["fill_method"], inplace=True)
        short.fillna(method=kwargs["fill_method"], inplace=True)

    # Name & Category
    _props = f"_{high_length}_{low_length}"
    data = {
        f"HILO{_props}": hilo,
        f"HILOl{_props}": long,
        f"HILOs{_props}": short,
    }
    df = DataFrame(data, index=close.index)

    df.name = f"HILO{_props}"
    df.category = "overlap"

    return df
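A rough usage sketch under the assumption that the helpers referenced above (`verify_series`, `ma`, `get_offset`, `npNaN`) are in scope, as they are inside pandas-ta where this indicator lives; the synthetic prices exist only to exercise the function.

# Hypothetical call with synthetic OHLC data; at least
# max(high_length, low_length) rows are needed, otherwise
# verify_series() returns None and hilo() returns early.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
close = pd.Series(100 + rng.normal(0, 1, 60).cumsum())
high = close + rng.uniform(0.1, 0.5, 60)
low = close - rng.uniform(0.1, 0.5, 60)

result = hilo(high, low, close)  # defaults: high_length=13, low_length=21
print(result.columns.tolist())   # ['HILO_13_21', 'HILOl_13_21', 'HILOs_13_21']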
def pipeline(self, ds: pd.Series) -> pd.Series:
    return (ds.pipe(self.pipe_location)
              .pipe(self.pipe_vaccine)
              .pipe(self.pipe_source))
# Filter and select data
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

series_obj = Series(np.arange(8),
                    index=['row 1', 'row 2', 'row 3', 'row 4',
                           'row 5', 'row 6', 'row 7', 'row 8'])
print(series_obj)
print(series_obj['row 3'])
print(series_obj[[0, 7]])

np.random.seed(25)
DF_obj = DataFrame(
    np.random.rand(36).reshape(6, 6),
    index=['row 1', 'row 2', 'row 3', 'row 4', 'row 5', 'row 6'],
    columns=['column 1', 'column 2', 'column 3',
             'column 4', 'column 5', 'column 6'])
print(DF_obj)
print(DF_obj.ix[['row 1', 'row 2'], ['column 2', 'column 4']])
class TestSeriesToCSV():

    def read_csv(self, path, **kwargs):
        params = dict(squeeze=True, index_col=0,
                      header=None, parse_dates=True)
        params.update(**kwargs)
        header = params.get("header")
        out = pd.read_csv(path, **params)

        if header is None:
            out.name = out.index.name = None

        return out

    def test_from_csv_deprecation(self, datetime_series):
        # see gh-17812
        with ensure_clean() as path:
            datetime_series.to_csv(path, header=False)

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                ts = self.read_csv(path)
                depr_ts = Series.from_csv(path)
                assert_series_equal(depr_ts, ts)

    @pytest.mark.parametrize("arg", ["path", "header", "both"])
    def test_to_csv_deprecation(self, arg, datetime_series):
        # see gh-19715
        with ensure_clean() as path:
            if arg == "path":
                kwargs = dict(path=path, header=False)
            elif arg == "header":
                kwargs = dict(path_or_buf=path)
            else:  # Both discrepancies match.
                kwargs = dict(path=path)

            with tm.assert_produces_warning(FutureWarning):
                datetime_series.to_csv(**kwargs)

                # Make sure roundtrip still works.
                ts = self.read_csv(path)
                assert_series_equal(datetime_series, ts, check_names=False)

    def test_from_csv(self, datetime_series, string_series):
        with ensure_clean() as path:
            datetime_series.to_csv(path, header=False)
            ts = self.read_csv(path)
            assert_series_equal(datetime_series, ts, check_names=False)

            assert ts.name is None
            assert ts.index.name is None

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                depr_ts = Series.from_csv(path)
                assert_series_equal(depr_ts, ts)

            # see gh-10483
            datetime_series.to_csv(path, header=True)
            ts_h = self.read_csv(path, header=0)
            assert ts_h.name == "ts"

            string_series.to_csv(path, header=False)
            series = self.read_csv(path)
            assert_series_equal(string_series, series, check_names=False)

            assert series.name is None
            assert series.index.name is None

            string_series.to_csv(path, header=True)
            series_h = self.read_csv(path, header=0)
            assert series_h.name == "series"

            with open(path, "w") as outfile:
                outfile.write("1998-01-01|1.0\n1999-01-01|2.0")

            series = self.read_csv(path, sep="|")
            check_series = Series({datetime(1998, 1, 1): 1.0,
                                   datetime(1999, 1, 1): 2.0})
            assert_series_equal(check_series, series)

            series = self.read_csv(path, sep="|", parse_dates=False)
            check_series = Series({"1998-01-01": 1.0, "1999-01-01": 2.0})
            assert_series_equal(check_series, series)

    def test_to_csv(self, datetime_series):
        import io

        with ensure_clean() as path:
            datetime_series.to_csv(path, header=False)

            with io.open(path, newline=None) as f:
                lines = f.readlines()
            assert (lines[1] != '\n')

            datetime_series.to_csv(path, index=False, header=False)
            arr = np.loadtxt(path)
            assert_almost_equal(arr, datetime_series.values)

    def test_to_csv_unicode_index(self):
        buf = StringIO()
        s = Series(["\u05d0", "d2"], index=["\u05d0", "\u05d1"])

        s.to_csv(buf, encoding="UTF-8", header=False)
        buf.seek(0)

        s2 = self.read_csv(buf, index_col=0, encoding="UTF-8")
        assert_series_equal(s, s2)

    def test_to_csv_float_format(self):
        with ensure_clean() as filename:
            ser = Series([0.123456, 0.234567, 0.567567])
            ser.to_csv(filename, float_format="%.2f", header=False)

            rs = self.read_csv(filename)
            xp = Series([0.12, 0.23, 0.57])
            assert_series_equal(rs, xp)

    def test_to_csv_list_entries(self):
        s = Series(['jack and jill', 'jesse and frank'])

        split = s.str.split(r'\s+and\s+')

        buf = StringIO()
        split.to_csv(buf, header=False)

    def test_to_csv_path_is_none(self):
        # GH 8215
        # Series.to_csv() was returning None, inconsistent with
        # DataFrame.to_csv() which returned string
        s = Series([1, 2, 3])
        csv_str = s.to_csv(path_or_buf=None, header=False)
        assert isinstance(csv_str, str)

    @pytest.mark.parametrize('s,encoding', [
        (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
                name='X'), None),
        # GH 21241, 21118
        (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
        (Series(["123", "你好", "世界"], name="中文"), 'gb2312'),
        (Series(["123", "Γειά σου", "Κόσμε"], name="Ελληνικά"), 'cp737')
    ])
    def test_to_csv_compression(self, s, encoding, compression):
        with ensure_clean() as filename:

            s.to_csv(filename, compression=compression, encoding=encoding,
                     header=True)

            # test the round trip - to_csv -> read_csv
            result = pd.read_csv(filename, compression=compression,
                                 encoding=encoding, index_col=0,
                                 squeeze=True)
            assert_series_equal(s, result)

            # test the round trip using file handle - to_csv -> read_csv
            f, _handles = _get_handle(filename, 'w',
                                      compression=compression,
                                      encoding=encoding)
            with f:
                s.to_csv(f, encoding=encoding, header=True)
            result = pd.read_csv(filename, compression=compression,
                                 encoding=encoding, index_col=0,
                                 squeeze=True)
            assert_series_equal(s, result)

            # explicitly ensure file was compressed
            with tm.decompress_file(filename, compression) as fh:
                text = fh.read().decode(encoding or 'utf8')
                assert s.name in text

            with tm.decompress_file(filename, compression) as fh:
                assert_series_equal(s, pd.read_csv(fh, index_col=0,
                                                   squeeze=True,
                                                   encoding=encoding))
def test_pindex_qaccess(self):
    pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q')
    s = Series(np.random.rand(len(pi)), index=pi).cumsum()
    # Todo: fix these accessors!
    self.assertEqual(s['05Q4'], s[2])
def video_avf_rec(user_id, num_of_rec):
    rated_by_user = trailer_seen.TrailerSeen.query.filter_by(
        seen_by=user_id, is_skipped=0)
    movies_seen = trailer_seen.TrailerSeen.query.filter_by(seen_by=user_id)
    rated_imdb = {}
    movies_to_exclude = []
    for r in rated_by_user:
        rated_imdb.update({r.imdb_id: r.rate})
    for r in movies_seen:
        movies_to_exclude.append(r.imdb_id)

    final_array = []
    i = 0
    for row in video_avf_sim.itertuples():
        num = float(0)
        den = float(0)
        i += 1
        neigh_splitted = row[3].split(",")
        for j in neigh_splitted:
            imdb_sim = j.split(":")
            imdb = imdb_sim[0]
            if rated_imdb.get(imdb, None):
                num += float(rated_imdb.get(imdb)) * float(imdb_sim[1])
                den += float(imdb_sim[1])
        if den == 0:
            final_v = float(0)
        else:
            final_v = float(num / den)
        if not final_v:
            final_v = float(0)
        final_array.append((row[1], final_v, row[2]))

    dtype = [('IMDB_ID', 'S10'), ('PREDICTED_VOTE', float),
             ('IMDB_VOTES', int)]
    numpy_final = numpy.array(final_array, dtype=dtype)
    numpy_final = numpy.sort(numpy_final, order=['PREDICTED_VOTE'])
    numpy_final = numpy_final[::-1]

    all_table = get_table("all_table")()
    all_table = all_table[~all_table["IMDB_ID"].isin(
        Series(movies_to_exclude))]
    all_table.reset_index(drop=True, inplace=True)

    final = {}
    safe_iter = 0
    while (len(final) < num_of_rec) and (safe_iter < 20):
        rec = numpy_final[safe_iter]
        movie = all_table[all_table["IMDB_ID"] == rec[0]].copy()
        if len(movie.index):
            movie.reset_index(drop=True, inplace=True)
            movie = movie.iloc[0]
            movie["REC_TYPE"] = "AUDIO"
            movie["PREDICTED_VOTE"] = rec[1]
            z = movie.to_json()
            safe_iter += 1
            final.update({len(final): z})
        else:
            safe_iter += 1
    return final