def test_all_any_params(self):
    # Check skipna, with implicit 'object' dtype.
    s1 = Series([np.nan, True])
    s2 = Series([np.nan, False])
    assert s1.all(skipna=False)  # nan && True => True
    assert s1.all(skipna=True)
    assert np.isnan(s2.any(skipna=False))  # nan || False => nan
    assert not s2.any(skipna=True)

    # Check level.
    s = pd.Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2])
    tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
    tm.assert_series_equal(s.any(level=0), Series([False, True, True]))

    # bool_only is not implemented with level option.
    with pytest.raises(NotImplementedError):
        s.any(bool_only=True, level=0)
    with pytest.raises(NotImplementedError):
        s.all(bool_only=True, level=0)

    # bool_only is not implemented alone.
    with pytest.raises(NotImplementedError):
        s.any(bool_only=True)
    with pytest.raises(NotImplementedError):
        s.all(bool_only=True)
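# A minimal standalone sketch (not part of the test above) of why the skipna
# assertions hold: under the implicit 'object' dtype, np.nan is truthy.
import numpy as np
import pandas as pd

s = pd.Series([np.nan, True])  # implicit object dtype
print(bool(np.nan))            # True: nan is a truthy float
print(s.all(skipna=False))     # True: nan && True => True
print(s.all(skipna=True))      # True: the nan is skipped entirely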
def test_all_any_boolean(self):
    # Check skipna, with boolean type
    s1 = Series([pd.NA, True], dtype="boolean")
    s2 = Series([pd.NA, False], dtype="boolean")
    assert s1.all(skipna=False) is pd.NA  # NA && True => NA
    assert s1.all(skipna=True)
    assert s2.any(skipna=False) is pd.NA  # NA || False => NA
    assert not s2.any(skipna=True)

    # GH-33253: all True / all False values buggy with skipna=False
    s3 = Series([True, True], dtype="boolean")
    s4 = Series([False, False], dtype="boolean")
    assert s3.all(skipna=False)
    assert not s4.any(skipna=False)

    # Check level TODO(GH-33449) result should also be boolean
    s = Series(
        [False, False, True, True, False, True],
        index=[0, 0, 1, 1, 2, 2],
        dtype="boolean",
    )
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_series_equal(s.any(level=0), Series([False, True, True]))
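# A minimal standalone sketch of the three-valued (Kleene) logic the
# assertions above rely on; the values here are illustrative, not from the test.
import pandas as pd

s = pd.Series([pd.NA, True, False], dtype="boolean")
print(s.any(skipna=False))      # True: a definite True decides the OR
print(s.all(skipna=False))      # False: a definite False decides the AND
print(s[:2].all(skipna=False))  # <NA>: NA && True stays unknown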
class Any:
    params = [[10**3, 10**6], ["fast", "slow"]]
    param_names = ["N", "case"]

    def setup(self, N, case):
        val = case == "fast"
        self.s = Series([val] * N)

    def time_any(self, N, case):
        self.s.any()
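# For context, asv calls setup() once per parameter combination and then times
# the time_* method repeatedly. A rough manual equivalent (assuming `Series`
# is imported from pandas, as the benchmark module would do at its top):
from pandas import Series

bench = Any()
bench.setup(10**3, "fast")     # builds a Series of 1,000 True values
bench.time_any(10**3, "fast")  # the call asv would time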
def _fill_res_dict(col: pd.Series, col_oob: pd.Series, res_dict: dict) -> dict:
    valid = not col_oob.any()
    res_dict["valid"] = valid
    if not valid:
        col_oob = col_oob.fillna(False)
        n = global_log_verbosity
        # get the unexpected values
        unexpected_index = col_oob.index[col_oob]
        unexpected_values = col[unexpected_index].astype(str)
        res_dict["percentage_of_column_is_error"] = len(unexpected_index) / len(col) * 100
        if n is not None:
            # if the global_log_verbosity is not 0, sample
            if n != 0:
                # asking for a higher sample than is there?
                if global_log_verbosity > len(unexpected_values):
                    n = len(unexpected_values)
                # sample the requested amount
                unexpected_values = unexpected_values.sample(n=n)
                unexpected_index = unexpected_values.index
            # log the required unexpected values
            res_dict["unexpected_index_sample"] = unexpected_index.tolist()
            res_dict["unexpected_values_sample"] = unexpected_values.tolist()
    return res_dict
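# A hypothetical call to the helper above; `global_log_verbosity` is assumed
# to be a module-level sample-size setting, and `col_oob` a boolean mask of
# out-of-bounds values. Names and data here are illustrative.
import pandas as pd

global_log_verbosity = 2  # log at most two offending values

col = pd.Series([1, 5, 99, 130, 120])
col_oob = col > 100  # True where the value is out of bounds
report = _fill_res_dict(col, col_oob, {})
print(report["valid"])                          # False
print(report["percentage_of_column_is_error"])  # 40.0
print(report["unexpected_values_sample"])       # e.g. ['130', '120']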
def test_all_any_params(self):
    # Check skipna, with implicit 'object' dtype.
    s1 = Series([np.nan, True])
    s2 = Series([np.nan, False])
    assert s1.all(skipna=False)  # nan && True => True
    assert s1.all(skipna=True)
    assert s2.any(skipna=False)
    assert not s2.any(skipna=True)

    # Check level.
    s = Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2])
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_series_equal(s.any(level=0), Series([False, True, True]))

    msg = "Option bool_only is not implemented with option level"
    with pytest.raises(NotImplementedError, match=msg):
        with tm.assert_produces_warning(FutureWarning):
            s.any(bool_only=True, level=0)
    with pytest.raises(NotImplementedError, match=msg):
        with tm.assert_produces_warning(FutureWarning):
            s.all(bool_only=True, level=0)

    # GH#38810 bool_only is not implemented alone.
    msg = "Series.any does not implement bool_only"
    with pytest.raises(NotImplementedError, match=msg):
        s.any(bool_only=True)
    msg = "Series.all does not implement bool_only."
    with pytest.raises(NotImplementedError, match=msg):
        s.all(bool_only=True)
def test_all_any_params(self):
    # Check skipna, with implicit 'object' dtype.
    s1 = Series([np.nan, True])
    s2 = Series([np.nan, False])
    assert s1.all(skipna=False)  # nan && True => True
    assert s1.all(skipna=True)
    assert np.isnan(s2.any(skipna=False))  # nan || False => nan
    assert not s2.any(skipna=True)

    # Check level.
    s = Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2])
    tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
    tm.assert_series_equal(s.any(level=0), Series([False, True, True]))

    msg = "Option bool_only is not implemented with option level"
    with pytest.raises(NotImplementedError, match=msg):
        s.any(bool_only=True, level=0)
    with pytest.raises(NotImplementedError, match=msg):
        s.all(bool_only=True, level=0)

    # bool_only is not implemented alone.
    # TODO GH38810 change this error message to:
    # "Series.any does not implement bool_only"
    msg = "Series.any does not implement numeric_only"
    with pytest.raises(NotImplementedError, match=msg):
        s.any(bool_only=True)
    msg = "Series.all does not implement numeric_only."
    with pytest.raises(NotImplementedError, match=msg):
        s.all(bool_only=True)
def test_all_any(self):
    ts = tm.makeTimeSeries()
    bool_series = ts > 0
    assert not bool_series.all()
    assert bool_series.any()

    # Alternative types, with implicit 'object' dtype.
    s = Series(["abc", True])
    assert "abc" == s.any()  # 'abc' || True => 'abc'
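# Under the object-dtype behavior this test pins down, any() reduces with
# `or`, so it can return the first truthy element itself rather than a bool.
from pandas import Series

s = Series(["abc", True])
print(s.any())  # 'abc': the expression 'abc' or True evaluates to 'abc'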
def test_any_all_datetimelike(self):
    # GH#38723 these may not be the desired long-term behavior (GH#34479)
    # but in the interim should be internally consistent
    dta = date_range("1995-01-02", periods=3)._data
    ser = Series(dta)
    df = DataFrame(ser)

    assert dta.all()
    assert dta.any()

    assert ser.all()
    assert ser.any()

    assert df.any().all()
    assert df.all().all()

    dta = dta.tz_localize("UTC")
    ser = Series(dta)
    df = DataFrame(ser)

    assert dta.all()
    assert dta.any()

    assert ser.all()
    assert ser.any()

    assert df.any().all()
    assert df.all().all()

    # tda[0] is Timedelta(0), which is falsey, so the all() checks below fail
    tda = dta - dta[0]
    ser = Series(tda)
    df = DataFrame(ser)

    assert tda.any()
    assert not tda.all()

    assert ser.any()
    assert not ser.all()

    assert df.any().all()
    assert not df.all().any()
def test_any_non_keyword_deprecation():
    df = DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
    msg = (
        "In a future version of pandas all arguments of "
        "DataFrame.any and Series.any will be keyword-only."
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.any("index", None)
    expected = Series({"A": True, "B": True, "C": False})
    tm.assert_series_equal(result, expected)

    s = Series([False, False, False])
    msg = (
        "In a future version of pandas all arguments of "
        "DataFrame.any and Series.any will be keyword-only."
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = s.any("index")
    expected = False
    tm.assert_equal(result, expected)
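# The forward-compatible spelling the warning points to: pass the arguments
# by keyword instead of positionally (reusing df and s from the test above).
result = df.any(axis="index", bool_only=None)  # instead of df.any("index", None)
result = s.any(axis="index")                   # instead of s.any("index")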
def threshold_cluster(Data_set, threshold):
    stand_array = np.asarray(Data_set).ravel('C')
    stand_Data = Series(stand_array)
    index_list, class_k = [], []
    # Note: the loop stops as soon as every remaining value is falsy (e.g. 0),
    # so zero-valued entries never get assigned to a cluster.
    while stand_Data.any():
        if len(stand_Data) == 1:
            index_list.append(list(stand_Data.index))
            class_k.append(list(stand_Data))
            stand_Data = stand_Data.drop(stand_Data.index)
        else:
            # take the first remaining element as the cluster seed
            class_data_index = stand_Data.index[0]
            class_data = stand_Data[class_data_index]
            stand_Data = stand_Data.drop(class_data_index)
            if (abs(stand_Data - class_data) <= threshold).any():
                # group every remaining value within `threshold` of the seed
                args_data = stand_Data[abs(stand_Data - class_data) <= threshold]
                stand_Data = stand_Data.drop(args_data.index)
                index_list.append([class_data_index] + list(args_data.index))
                class_k.append([class_data] + list(args_data))
            else:
                index_list.append([class_data_index])
                class_k.append([class_data])
    return index_list, class_k
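# An illustrative call with made-up one-dimensional data (assuming numpy and
# pandas.Series are imported, as the function above requires); values within
# `threshold` of the current seed land in the same cluster.
data = [1.0, 1.2, 5.0, 5.1, 9.0]
index_list, class_k = threshold_cluster(data, threshold=0.5)
print(index_list)  # [[0, 1], [2, 3], [4]]
print(class_k)     # [[1.0, 1.2], [5.0, 5.1], [9.0]]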
def load(self, datelist, tracklist=[], path=''):
    """
    load all JCapper Race files (aka past performance files) for the specified dates

    :param datelist: list of dates
    :param tracklist: list of x8 track symbols e.g. DMR, etc
    :param path: str, if given, load data from given directory, otherwise, load data directly from s3
    """
    schema_filepath = os.path.join(os.path.dirname(__file__), 'schema_pastperformance.csv')
    columnDict = read_csv(schema_filepath)['field_name'].to_dict()
    # convert tracklist symbols to jcp track symbols
    tracklist = Series(tracklist).map(self.map_track_x8_to_jcp)
    if tracklist.isnull().any():
        raise ValueError(
            'tracklist must be list of x8 track symbols in track_detail: \n%s' % tracklist)
    # raw data
    self.dfraw = DataFrame()
    if not path:
        # load each date and concat to the master raw df
        for d in datelist:
            # load the DataFrame for this date, e.g. DMR0831F.TXT for 2017-08-31
            year = d.strftime('%Y')
            month = d.strftime('%m')
            day = d.strftime('%d')
            # skip Christmas, no jcapper file
            if month == '12' and day in ['24', '25']:
                continue
            key = 'x8-bucket/jcapper/%s/%s/%s/' % (year, month, day)
            # list of all files in a given directory - in this case, all files for a single day
            s3_files = self.s3.ls(key)
            # filter for .jcp files, in this case for non Chart Files
            s3_files = [os.path.basename(fp) for fp in s3_files if fp[-5] != 'F']
            # filter tracks
            if len(tracklist) > 0:
                s3_files = [fp for fp in s3_files if fp[:3] in list(tracklist)]
            idx_s3files = Series([n[:3] for n in s3_files]).drop_duplicates().index
            if self.verbose:
                print('pp.load(%s) loading %s race cards..'
                      % (d.strftime('%Y-%m-%d'), len(idx_s3files)))
            # load all past performance files for given date, track is no longer a condition
            for i in idx_s3files:
                fp = os.path.join(key, s3_files[i])
                if fp[-3:] == 'jcp':
                    df = read_csv(self.s3.open(fp, mode='rb'), header=None, encoding='ISO-8859-1')
                else:
                    df = read_csv(self.s3.open(fp, mode='rb'), header=None,
                                  compression='zip', encoding='ISO-8859-1')
                # concat in the master df
                self.dfraw = concat([self.dfraw, df])
    else:
        # load each date and concat to the master raw df
        for d in datelist:
            # load the DataFrame for this date, e.g. DMR0831F.TXT for 2017-08-31
            year = d.strftime('%Y')
            month = d.strftime('%m')
            day = d.strftime('%d')
            # skip Christmas, no jcapper file
            if month == '12' and day in ['24', '25']:
                continue
            path_day = os.path.join(path, 'jcapper', year, month, day)
            # list of all files in a given directory - in this case, all files for a single day
            files = os.listdir(path_day)
            # filter tracks
            if tracklist.any():
                files = [fp for fp in files if os.path.basename(fp)[:3] in list(tracklist)]
            if self.verbose:
                print('pp.load(%s) loading %s race cards..'
                      % (d.strftime('%Y-%m-%d'), len(files)))
            # load all past performance files for given date, track is no longer a condition
            for fp in files:
                # filter for .jcp files, in this case for non Chart Files
                if fp[-5] != 'F':
                    fp = os.path.join(path_day, fp)
                    if fp[-3:] == 'jcp':
                        df = read_csv(fp, header=None, encoding='ISO-8859-1')
                    else:
                        df = read_csv(fp, header=None, compression='zip', encoding='ISO-8859-1')
                    # concat in the master df
                    self.dfraw = concat([self.dfraw, df])
    try:
        # copy a subset of columns and replace the header names (numbers to text)
        cols = list(columnDict.keys())
        self.df = self.dfraw[cols].copy()
    except KeyError:
        raise Exception('No files available for given datelist and tracklist.')
    # column names
    self.df.rename(columns=columnDict, inplace=True)
    # normalize track sym and make race_id
    self.df['x8_track_sym'] = self.df['jcp_track_sym'].map(self.map_track_jcp_to_x8)
    # adding itsp_track_sym here so that we can filter for bettable tracks in daily races
    self.df['itsp_track_sym'] = self.df['x8_track_sym'].map(self.map_track_x8_to_itsp)
    # drop rows where we are missing jcp symbol mapping in track detail if any
    x8_isnull = self.df['x8_track_sym'].isnull()
    if x8_isnull.any():
        missing_jcp_symbols = self.df[x8_isnull]['jcp_track_sym'].unique()
        warn('pp.load() track_detail.csv is missing jcp symbols: %s\n'
             'Dropping all rows with missing symbols' % missing_jcp_symbols)
        self.df = self.df[~x8_isnull]
        print('pp.load() dropping %s rows' % x8_isnull.sum())
    # convert dates and validate
    self.df['race_time_flag'] = self.df['race_time'].isnull()  # flag bad race_time values (sometimes is null)
    self.df['race_time'] = to_datetime(
        self.df['date'].astype(str) + self.df['race_time'].fillna(1000.0).astype(int).astype(str),
        format='%Y%m%d%H%M')
    self.df['race_time_utc'] = self.df['race_time'].map(lambda x: x + timedelta(hours=8))
    self.df['race_time_toronto'] = self.df['race_time'].map(lambda x: x + timedelta(hours=3))
    self.df['date'] = to_datetime(self.df['date'], format='%Y%m%d')
    self.df['date_str'] = self.df['date'].dt.strftime('%Y%m%d')
    self._birthdate_columns()
    # clean nans in wk date cols
    fields_wk_date = [c for c in self.df.columns if c.startswith('wk_date')]
    self.df['wk_date_1'].fillna(self.df['date_str'], inplace=True)
    self.df[fields_wk_date] = self.df[fields_wk_date].fillna(method='ffill', axis=1)
    self.df[fields_wk_date] = self.df[fields_wk_date].applymap(
        lambda x: to_datetime(str(int(x)), format='%Y%m%d'))
    # clean nans in pp date cols
    fields_pp_date = [c for c in self.df.columns if c.startswith('pp_date')]
    self.df['pp_date_0'].fillna(self.df['date_str'], inplace=True)
    self.df[fields_pp_date] = self.df[fields_pp_date].fillna(method='ffill', axis=1)
    self.df[fields_pp_date] = self.df[fields_pp_date].applymap(
        lambda x: to_datetime(str(int(x)), format='%Y%m%d'))
    self.df['race_id'] = (self.df['x8_track_sym'] + '_' + self.df['date_str']
                          + '_' + self.df['race_race_num'].astype(str))
    self.df['runner_program_number'] = self.df['runner_program_number'].map(str)
    self.df['betting_interest'] = self.df['runner_program_number'].str.strip('A')
    self.df['coupled'] = self.df['runner_program_number'].str.count('A').astype(bool)
    self.df['coupled_race'] = self.df.groupby('race_id')['coupled'].transform('any')
    self.df['runner_id'] = self.df['race_id'] + '_' + self.df['runner_program_number']
    # additional time index data and day of week for seasonality
    self.df['month'] = self.df['date'].map(lambda x: x.month)
    self.df['weekday'] = self.df['date'].map(lambda x: x.strftime('%A'))
    self.df['year'] = self.df['date'].map(lambda x: x.year)
    # note: '%w' yields day of week (0=Sunday), despite the column name
    self.df['weeknum'] = self.df['date'].map(lambda x: x.strftime('%w'))
    # normalize horse name
    self.df['x8name'] = self.df['name'].map(self._normalize_name)
    self.df['x8country'] = self.df['name'].map(self._country_from_name)
    # convert pp_track and wk_track columns to x8 symbol
    fields_pp_track = [c for c in self.df.columns if c.startswith('pp_track_')]
    self.df[fields_pp_track] = self.df[fields_pp_track].applymap(
        lambda x: self.map_track_chart_to_x8.get(x))
    fields_wk_track = [c for c in self.df.columns if c.startswith('wk_track_')]
    self.df[fields_wk_track] = self.df[fields_wk_track].applymap(
        lambda x: self.map_track_chart_to_x8.get(x))
    # make dataframes for historical pp columns and wk columns that are multiindexed by date
    self._index_pp_columns()
    self._index_wk_columns()
    # validate df
    self._validate(datelist)
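# A hypothetical invocation of load(); the owning class name is assumed, and
# a real instance would need the track-detail maps (map_track_x8_to_jcp etc.)
# plus, for the s3 branch, a configured self.s3 filesystem.
from datetime import date

pp = PastPerformance()  # hypothetical class that owns load()
pp.load(datelist=[date(2017, 8, 31)], tracklist=['DMR'], path='')
print(pp.df[['race_id', 'runner_id', 'x8name']].head())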