def test_multiindex_objects():
    mi = MultiIndex(
        levels=[["b", "d", "a"], [1, 2, 3]],
        codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
        names=["col1", "col2"],
    )
    recons = mi._sort_levels_monotonic()

    # These are equal.
    assert mi.equals(recons)
    assert Index(mi.values).equals(Index(recons.values))

    # _hashed_values and hash_pandas_object(..., index=False) equivalency.
    expected = hash_pandas_object(mi, index=False).values
    result = mi._hashed_values
    tm.assert_numpy_array_equal(result, expected)

    expected = hash_pandas_object(recons, index=False).values
    result = recons._hashed_values
    tm.assert_numpy_array_equal(result, expected)

    expected = mi._hashed_values
    result = recons._hashed_values

    # Values should match, but in different order.
    tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))
def test_hash_pandas_object(obj):
    a = hash_pandas_object(obj)
    b = hash_pandas_object(obj)
    if isinstance(a, np.ndarray):
        np.testing.assert_equal(a, b)
    else:
        assert_eq(a, b)
def test_object_missing_values():
    # Check that the presence of missing values doesn't change how object dtype
    # is hashed.
    s = pd.Series(['a', 'b', 'c', None])
    h1 = hash_pandas_object(s).iloc[:3]
    h2 = hash_pandas_object(s.iloc[:3])
    tm.assert_series_equal(h1, h2)
def save(self, filename=None):
    """Save the current recommender.

    :param filename: string or None
        Name of the file to save to.
    """
    if filename is None:
        fn = self.serialized_rec_path
    else:
        fn = filename
    if os.path.isfile(fn):
        logger.warning('overwriting ' + fn)

    save_dict = copy.deepcopy(self.__dict__)

    # remove results_df to save space; it is restored from fn by load()
    if 'results_df' in save_dict.keys():
        logger.debug('deleting save_dict[results_df]:'
                     + str(save_dict['results_df'].head()))
        rowHashes = hash_pandas_object(save_dict['results_df']).values
        save_dict['results_df_hash'] = hashlib.sha256(rowHashes).hexdigest()
        del save_dict['results_df']

    # remove ml_p to save space
    rowHashes = hash_pandas_object(save_dict['_ml_p'].apply(str)).values
    save_dict['ml_p_hash'] = hashlib.sha256(rowHashes).hexdigest()
    del save_dict['_ml_p']
    del save_dict['mlp_combos']

    logger.info('saving recommender as ' + fn)
    f = gzip.open(fn, 'wb')
    pickle.dump(save_dict, f, 2)
    f.close()
def test_multiindex_objects(self):
    mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                    labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                    names=['col1', 'col2'])
    recons = mi._sort_levels_monotonic()

    # these are equal
    assert mi.equals(recons)
    assert Index(mi.values).equals(Index(recons.values))

    # _hashed_values and hash_pandas_object(..., index=False)
    # equivalency
    expected = hash_pandas_object(mi, index=False).values
    result = mi._hashed_values
    tm.assert_numpy_array_equal(result, expected)

    expected = hash_pandas_object(recons, index=False).values
    result = recons._hashed_values
    tm.assert_numpy_array_equal(result, expected)

    expected = mi._hashed_values
    result = recons._hashed_values

    # values should match, but in different order
    tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))
def test_drift_detector_lightgbm(self):
    df = load_bank()
    y = df.pop('y')
    X_train, X_test = train_test_split(df.copy(), train_size=0.7,
                                       shuffle=True, random_state=9527)
    dd = DriftDetector()
    dd.fit(X_train, X_test)

    assert len(dd.feature_names_) == 17
    assert len(dd.feature_importances_) == 17
    assert dd.auc_
    assert len(dd.estimator_) == 5

    proba = dd.predict_proba(df)
    assert proba.shape[0] == df.shape[0]

    df = load_bank()
    y = df.pop('y')
    p = int(df.shape[0] * 0.2)
    X_train, X_test, y_train, y_test = dd.train_test_split(df.copy(), y, test_size=0.2)
    assert X_train.shape == (df.shape[0] - p, df.shape[1])
    assert y_train.shape == (df.shape[0] - p,)
    assert X_test.shape == (p, df.shape[1])
    assert y_test.shape == (p,)

    df['y'] = y
    X_train['y'] = y_train
    X_test['y'] = y_test
    df_split = pd.concat([X_train, X_test])
    df_hash = hash_pandas_object(df).sort_values()
    splitted_hash = hash_pandas_object(df_split).sort_values()
    assert (df_hash == splitted_hash).all()
def test_drift_detector_split(self):
    df = dd.from_pandas(load_bank(), npartitions=2)
    y = df.pop('y')
    X_train, X_test = DaskToolBox.train_test_split(df.copy(), train_size=0.7,
                                                   shuffle=True, random_state=9527)
    ddr = dd_selector().get_detector()
    ddr.fit(X_train, X_test)

    assert len(ddr.feature_names_) == 17
    assert len(ddr.feature_importances_) == 17
    assert ddr.auc_
    assert len(ddr.estimator_) == 5

    proba = ddr.predict_proba(df)
    assert proba.compute().shape[0] == len(df)

    df = dd.from_pandas(load_bank(), npartitions=2)
    y = df.pop('y')
    p = int(len(df) * 0.2)
    X_train, X_test, y_train, y_test = ddr.train_test_split(df.copy(), y, test_size=0.2,
                                                            remain_for_train=0.)
    df, X_train, X_test, y_train, y_test = \
        DaskToolBox.compute(df, X_train, X_test, y_train, y_test)
    assert X_train.shape == (df.shape[0] - p, df.shape[1])
    assert y_train.shape == (df.shape[0] - p,)
    assert X_test.shape == (p, df.shape[1])
    assert y_test.shape == (p,)

    df['y'] = y
    X_train['y'] = y_train
    X_test['y'] = y_test
    df_split = pd.concat([X_train, X_test])
    df_hash = hash_pandas_object(df).sort_values()
    splitted_hash = hash_pandas_object(df_split).sort_values()
    assert (df_hash == splitted_hash).all()
def test_hash_with_tuple():
    # GH#28969 array containing a tuple raises on call to arr.astype(str)
    # apparently a numpy bug github.com/numpy/numpy/issues/9441
    df = DataFrame({"data": [tuple("1"), tuple("2")]})
    result = hash_pandas_object(df)
    expected = Series([10345501319357378243, 8331063931016360761], dtype=np.uint64)
    tm.assert_series_equal(result, expected)

    df2 = DataFrame({"data": [(1,), (2,)]})
    result = hash_pandas_object(df2)
    expected = Series([9408946347443669104, 3278256261030523334], dtype=np.uint64)
    tm.assert_series_equal(result, expected)

    # require that the elements of such tuples are themselves hashable
    df3 = DataFrame({"data": [(1, []), (2, {})]})
    with pytest.raises(TypeError, match="unhashable type: 'list'"):
        hash_pandas_object(df3)
def identify_compatible_groups(dataframes_with_metadata):
    already_classified = set()
    compatible_groups = []

    for t1, path1, md1 in dataframes_with_metadata:
        # these local variables are for this one view
        compatible_group = [path1]
        hashes1 = hash_pandas_object(t1, index=False)
        ht1 = hashes1.sum()

        if path1 in already_classified:
            continue

        for t2, path2, md2 in dataframes_with_metadata:
            if path1 == path2:  # same table
                continue
            # if t2 is in remove group
            if path2 in already_classified:
                continue

            hashes2 = hash_pandas_object(t2, index=False)
            ht2 = hashes2.sum()

            # are views compatible
            if ht1 == ht2:
                compatible_group.append(path2)
                already_classified.add(path1)
                already_classified.add(path2)

        # if len(compatible_group) > 1:
        # cannot check this condition because now all views are analyzed from compatible groups
        compatible_groups.append(compatible_group)

    return compatible_groups
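# A hypothetical usage sketch for identify_compatible_groups above: the toy
# DataFrames and paths are invented for illustration, and each input entry is
# assumed to be a (dataframe, path, metadata) triple as iterated in the function.
import pandas as pd

view_a = pd.DataFrame({"id": [1, 2], "name": ["x", "y"]})
view_b = pd.DataFrame({"id": [1, 2], "name": ["x", "y"]})  # same content as view_a
view_c = pd.DataFrame({"id": [3, 4], "name": ["z", "w"]})  # different content

inputs = [(view_a, "views/a.csv", None),
          (view_b, "views/b.csv", None),
          (view_c, "views/c.csv", None)]

groups = identify_compatible_groups(inputs)
# a.csv and b.csv have equal row-hash sums, so they form one group;
# c.csv ends up in a group of its own.
print(groups)  # [['views/a.csv', 'views/b.csv'], ['views/c.csv']]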
def test_pandas_errors(self):
    with pytest.raises(TypeError):
        hash_pandas_object(pd.Timestamp('20130101'))

    obj = tm.makePanel()
    with pytest.raises(TypeError):
        hash_pandas_object(obj)
def check_equal(self, obj, **kwargs):
    a = hash_pandas_object(obj, **kwargs)
    b = hash_pandas_object(obj, **kwargs)
    tm.assert_series_equal(a, b)

    kwargs.pop('index', None)
    a = hash_pandas_object(obj, **kwargs)
    b = hash_pandas_object(obj, **kwargs)
    tm.assert_series_equal(a, b)
def check_not_equal_with_index(self, obj):
    # check that we are not hashing the same if
    # we include the index
    if not isinstance(obj, Index):
        a = hash_pandas_object(obj, index=True)
        b = hash_pandas_object(obj, index=False)
        if len(obj):
            assert not (a == b).all()
def test_hash_keys(self):
    # using different hash keys, should have different hashes
    # for the same data

    # this only matters for object dtypes
    obj = Series(list('abc'))
    a = hash_pandas_object(obj, hash_key='9876543210123456')
    b = hash_pandas_object(obj, hash_key='9876543210123465')
    assert (a != b).all()
def test_pandas_errors(self):
    with pytest.raises(TypeError):
        hash_pandas_object(pd.Timestamp('20130101'))

    with catch_warnings(record=True):
        obj = tm.makePanel()
    with pytest.raises(TypeError):
        hash_pandas_object(obj)
def test_df_hash_keys():
    # DataFrame version of test_hash_keys.
    # https://github.com/pandas-dev/pandas/issues/41404
    obj = DataFrame({"x": np.arange(3), "y": list("abc")})

    a = hash_pandas_object(obj, hash_key="9876543210123456")
    b = hash_pandas_object(obj, hash_key="9876543210123465")

    assert (a != b).all()
def summarize_views_and_find_candidate_complementary(dataframes_with_metadata):
    already_processed_complementary_pairs = set()

    contained_groups = []
    candidate_complementary_groups = []

    for df1, path1, md1 in dataframes_with_metadata:
        # these local variables are for this one view
        contained_group = [path1]

        hashes1_list = hash_pandas_object(df1, index=False)  # we only consider content
        hashes1_set = set(hashes1_list)

        for df2, path2, md2 in dataframes_with_metadata:
            if path1 == path2:  # same table
                continue

            hashes2_list = hash_pandas_object(df2, index=False)
            hashes2_set = set(hashes2_list)

            # are views potentially contained
            if len(hashes1_set) > len(hashes2_set):
                # is t2 contained in t1?
                if len(hashes2_set - hashes1_set) == 0:
                    contained_group.append(path2)
            else:
                if (path1 + "%%%" + path2) in already_processed_complementary_pairs \
                        or (path2 + "%%%" + path1) in already_processed_complementary_pairs:
                    continue  # already processed, skip computation

                # Verify that views are potentially complementary
                s12 = (hashes1_set - hashes2_set)
                s1_complement = set()
                if len(s12) > 0:
                    s1_complement.update(s12)

                s21 = (hashes2_set - hashes1_set)
                s2_complement = set()
                if len(s21) > 0:
                    s2_complement.update(s21)

                # and, otherwise it's a containment rel
                if len(s1_complement) > 0 and len(s2_complement) > 0:
                    idx1 = [idx for idx, value in enumerate(hashes1_list)
                            if value in s1_complement]
                    idx2 = [idx for idx, value in enumerate(hashes2_list)
                            if value in s2_complement]
                    candidate_complementary_groups.append(
                        (df1, md1, path1, idx1, df2, md2, path2, idx2))

                already_processed_complementary_pairs.add(path1 + "%%%" + path2)
                already_processed_complementary_pairs.add(path2 + "%%%" + path1)

        if len(contained_group) > 1:
            contained_groups.append(contained_group)

    return contained_groups, candidate_complementary_groups
def test_hash_keys():
    # Using different hash keys, should have
    # different hashes for the same data.
    #
    # This only matters for object dtypes.
    obj = Series(list("abc"))

    a = hash_pandas_object(obj, hash_key="9876543210123456")
    b = hash_pandas_object(obj, hash_key="9876543210123465")

    assert (a != b).all()
def test_deprecation():
    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        from pandas.tools.hashing import hash_pandas_object
        obj = Series(list('abc'))
        hash_pandas_object(obj, hash_key='9876543210123456')

    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        from pandas.tools.hashing import hash_array
        obj = np.array([1, 2, 3])
        hash_array(obj, hash_key='9876543210123456')
def test_df_encoding():
    # Check that DataFrame recognizes optional encoding.
    # https://github.com/pandas-dev/pandas/issues/41404
    # https://github.com/pandas-dev/pandas/pull/42049
    obj = DataFrame({"x": np.arange(3), "y": list("a+c")})

    a = hash_pandas_object(obj, encoding="utf8")
    b = hash_pandas_object(obj, encoding="utf7")

    # Note that the "+" is encoded as "+-" in utf-7.
    assert a[0] == b[0]
    assert a[1] != b[1]
    assert a[2] == b[2]
def test_categorical_consistency(self, s1, categorize):
    # GH15143
    # Check that categoricals hash consistent with their values, not codes
    # This should work for categoricals of any dtype
    s2 = s1.astype('category').cat.set_categories(s1)
    s3 = s2.cat.set_categories(list(reversed(s1)))

    # These should all hash identically
    h1 = hash_pandas_object(s1, categorize=categorize)
    h2 = hash_pandas_object(s2, categorize=categorize)
    h3 = hash_pandas_object(s3, categorize=categorize)

    tm.assert_series_equal(h1, h2)
    tm.assert_series_equal(h1, h3)
def _check_equal(obj, **kwargs):
    """
    Check that hashing an object produces the same value each time.

    Parameters
    ----------
    obj : object
        The object to hash.
    kwargs : kwargs
        Keyword arguments to pass to the hashing function.
    """
    a = hash_pandas_object(obj, **kwargs)
    b = hash_pandas_object(obj, **kwargs)
    tm.assert_series_equal(a, b)
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                       categorize=True):
    if hash_key is None:
        hash_key = _default_hash_key

    def adder(h, hashed_to_add):
        h = np.multiply(h, np.uint(3), h)
        return np.add(h, hashed_to_add, h)

    if isinstance(obj, pd.Index):
        h = hash_array(obj.values, encoding, hash_key,
                       categorize).astype('uint64')
        h = pd.Series(h, index=obj, dtype='uint64')
    elif isinstance(obj, pd.Series):
        h = hash_array(obj.values, encoding, hash_key,
                       categorize).astype('uint64')
        if index:
            h = adder(h, hash_pandas_object(obj.index,
                                            index=False,
                                            encoding=encoding,
                                            hash_key=hash_key,
                                            categorize=categorize).values)
        h = pd.Series(h, index=obj.index, dtype='uint64')
    elif isinstance(obj, pd.DataFrame):
        cols = obj.iteritems()
        first_series = next(cols)[1]
        h = hash_array(first_series.values, encoding,
                       hash_key, categorize).astype('uint64')
        for _, col in cols:
            h = adder(h, hash_array(col.values, encoding,
                                    hash_key, categorize))
        if index:
            h = adder(h, hash_pandas_object(obj.index,
                                            index=False,
                                            encoding=encoding,
                                            hash_key=hash_key,
                                            categorize=categorize).values)
        h = pd.Series(h, index=obj.index, dtype='uint64')
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h
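# The function above is an early in-house implementation; pandas ships an
# equivalent public helper as pandas.util.hash_pandas_object. A minimal usage
# sketch (toy column names, invented for illustration):
import pandas as pd
from pandas.util import hash_pandas_object

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# One uint64 per row; by default the index participates in the hash.
with_index = hash_pandas_object(df, index=True)

# Hashing only the values gives an index-independent result.
values_only = hash_pandas_object(df, index=False)

assert not (with_index == values_only).all()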
def test_categorical_consistency():
    # Check that categoricals hash consistent with their values, not codes
    # This should work for categoricals of any dtype
    for s1 in [pd.Series(['a', 'b', 'c', 'd']),
               pd.Series([1000, 2000, 3000, 4000]),
               pd.Series(pd.date_range(0, periods=4))]:
        s2 = s1.astype('category').cat.set_categories(s1)
        s3 = s2.cat.set_categories(list(reversed(s1)))
        for categorize in [True, False]:
            # These should all hash identically
            h1 = hash_pandas_object(s1, categorize=categorize)
            h2 = hash_pandas_object(s2, categorize=categorize)
            h3 = hash_pandas_object(s3, categorize=categorize)
            tm.assert_series_equal(h1, h2)
            tm.assert_series_equal(h1, h3)
def _check_not_equal_with_index(obj):
    """
    Check that the hash of an object with and without its index is not the same.

    Parameters
    ----------
    obj : object
        The object to hash.
    """
    if not isinstance(obj, Index):
        a = hash_pandas_object(obj, index=True)
        b = hash_pandas_object(obj, index=False)

        if len(obj):
            assert not (a == b).all()
def test_hashable_tuple_args():
    # require that the elements of such tuples are themselves hashable
    df3 = DataFrame({"data": [(1, []), (2, {})]})
    with pytest.raises(TypeError, match="unhashable type: 'list'"):
        hash_pandas_object(df3)
def test_multiindex_unique():
    mi = MultiIndex.from_tuples([(118, 472), (236, 118),
                                 (51, 204), (102, 51)])
    assert mi.is_unique is True

    result = hash_pandas_object(mi)
    assert result.is_unique is True
def evaluate(self):
    """Use the cryptocompare API to evaluate the portfolio in the given currency."""
    # Load the backup file if it exists and compare the hash of the previous p_raw to
    # the current p_raw before fetching historical data from the cryptocompare API (slow).
    # Create a tmp dir in the same folder as the ledger csv.
    tmp_dir = os.path.dirname(os.path.abspath(self.ledger.csv_db_path)) + '/tmp'
    if not os.path.exists(tmp_dir):  # create the tmp directory if it does not exist
        os.makedirs(tmp_dir)
    file_path = tmp_dir + '/p_eval_' + self.eval_symbol + '.pkl'
    hash_path = tmp_dir + '/p_raw_hash_' + self.eval_symbol

    current_hash = str(hash_pandas_object(self.p_raw).sum())
    old_hash = ''
    is_loaded = True
    # NOTE: a better way would be to check whether the path is valid
    # (although that doesn't guarantee it can be opened)
    try:
        self.p_eval = pd.read_pickle(file_path)  # try to load the previous file back
        with open(hash_path) as f:
            old_hash = f.read()
    except IOError:  # if this fails, evaluate the portfolio and save the file afterwards
        is_loaded = False

    if not is_loaded or old_hash != current_hash:
        # file was loaded but is not up to date
        # (note that a new transaction will only be picked up on the next day)
        self.p_eval = self.p_raw.apply(Portfolio.valuation, args=(self.eval_symbol,))
        self.p_eval.to_pickle(file_path)
        with open(hash_path, "w") as f:
            f.write(current_hash)

    return self.p_eval
def test_categorical_consistency(self):
    # GH15143
    # Check that categoricals hash consistent with their values, not codes
    # This should work for categoricals of any dtype
    for s1 in [Series(['a', 'b', 'c', 'd']),
               Series([1000, 2000, 3000, 4000]),
               Series(pd.date_range(0, periods=4))]:
        s2 = s1.astype('category').cat.set_categories(s1)
        s3 = s2.cat.set_categories(list(reversed(s1)))
        for categorize in [True, False]:
            # These should all hash identically
            h1 = hash_pandas_object(s1, categorize=categorize)
            h2 = hash_pandas_object(s2, categorize=categorize)
            h3 = hash_pandas_object(s3, categorize=categorize)
            tm.assert_series_equal(h1, h2)
            tm.assert_series_equal(h1, h3)
def compute_hll_array(obj, b):
    # b is the number of bits
    if not 8 <= b <= 16:
        raise ValueError("b should be between 8 and 16")
    num_bits_discarded = 32 - b
    m = 1 << b

    # Get an array of the hashes
    hashes = hash_pandas_object(obj, index=False)
    if isinstance(hashes, pd.Series):
        hashes = hashes._values
    hashes = hashes.astype(np.uint32)

    # Of the first b bits, which is the first nonzero?
    j = hashes >> num_bits_discarded
    first_bit = compute_first_bit(hashes)

    # Pandas can do the max aggregation
    df = pd.DataFrame({"j": j, "first_bit": first_bit})
    series = df.groupby("j").max()["first_bit"]

    # Return a dense array so we can concat them and get a result
    # that is easy to deal with
    return series.reindex(np.arange(m), fill_value=0).values.astype(np.uint8)
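# compute_hll_array above relies on a compute_first_bit helper that is not part
# of this snippet. A minimal sketch consistent with how it is used (1-based
# position of the lowest set bit of each 32-bit hash); this is an assumption,
# not necessarily the original helper:
import numpy as np

def compute_first_bit(a):
    # Mark which of the 32 bit positions are set for each hash value.
    bits = np.bitwise_and.outer(a, 1 << np.arange(32))
    # From the lowest set bit onward everything becomes True after the cumsum,
    # so the row sum equals 32 minus the 0-based position of the lowest set bit.
    bits = bits.cumsum(axis=1).astype(bool)
    # 1-based position of the lowest set bit; a value of 0 maps to 33.
    return 33 - bits.sum(axis=1)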
def __init__(self, args):
    super(RecoModelRTAE, self).__init__()
    self.cpu_device = torch.device('cpu')
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.device = self.cpu_device
    self.args = args
    self.run_name = f"{args['algorithm']}_{args['P']}_{args['batch_size']}_{args['K']}_{datetime.datetime.now()}"
    self.use_em = args['use_em']

    # Dataset stuff
    if isinstance(args['dataset'], str):
        self.data_hash = None
        fn_train = args['dataset'] + '/train.csv'
        fn_test = args['dataset'] + '/test.csv'
        self.dataset = PandasDataset(args['P'], pd.read_csv(fn_train), testing=True)
        self.dataset_test = PandasDataset(args['P'], pd.read_csv(fn_test), testing=True)
    else:
        # support the direct insertion of a dataframe into the dataset
        self.dataset = PandasDataset(args['P'], args['dataset'], testing=True)
        self.data_hash = hash_pandas_object(args['dataset'].v).sum()
def isotherm_to_hash(isotherm):
    """
    Convert an isotherm object to a unique hash.

    Parameters
    ----------
    isotherm : PointIsotherm
        Isotherm to be hashed.

    Returns
    -------
    str
        A string with the Isotherm hash.
    """
    # Isotherm properties
    raw_dict = isotherm.to_dict()

    # Isotherm data or model
    if isinstance(isotherm, pygaps.PointIsotherm):
        raw_dict["data_hash"] = str(
            hash_pandas_object(isotherm.data_raw.round(8)).sum())
    elif isinstance(isotherm, pygaps.ModelIsotherm):
        raw_dict["data_hash"] = isotherm.model.to_dict()

    md_hasher = hashlib.md5(
        json.dumps(raw_dict, sort_keys=True).encode('utf-8'))

    return md_hasher.hexdigest()
def _hash_array_like_obj_as_bytes(data):
    """
    Helper method to convert a pandas DataFrame / numpy array / list into bytes
    for MD5 calculation purposes.
    """
    from pandas.util import hash_pandas_object
    import numpy as np
    import pandas as pd

    if isinstance(data, pd.DataFrame):
        # check `'pyspark' in sys.modules` to avoid importing pyspark when the user
        # runs code not related to pyspark.
        if "pyspark" in sys.modules:
            from pyspark.ml.linalg import Vector as spark_vector_type
        else:
            spark_vector_type = None

        def _hash_array_like_element_as_bytes(v):
            if spark_vector_type is not None:
                if isinstance(v, spark_vector_type):
                    return _hash_ndarray_as_bytes(v.toArray())
            if isinstance(v, np.ndarray):
                return _hash_ndarray_as_bytes(v)
            if isinstance(v, list):
                return _hash_ndarray_as_bytes(np.array(v))
            return v

        data = data.applymap(_hash_array_like_element_as_bytes)
        return _hash_uint64_ndarray_as_bytes(hash_pandas_object(data))
    elif isinstance(data, np.ndarray):
        return _hash_ndarray_as_bytes(data)
    elif isinstance(data, list):
        return _hash_ndarray_as_bytes(np.array(data))
    else:
        raise ValueError("Unsupported data type.")
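# _hash_array_like_obj_as_bytes above depends on _hash_ndarray_as_bytes and
# _hash_uint64_ndarray_as_bytes, which are not included in this snippet. A
# plausible sketch of such helpers (assumed here, not the library's verbatim
# code), packing the uint64 hashes big-endian and folding in the array shape:
import struct

import numpy as np
import pandas as pd

def _hash_uint64_ndarray_as_bytes(array):
    # Pack a 1-D sequence of uint64 hashes into a fixed-width byte string.
    assert len(array.shape) == 1
    return struct.pack(f">{array.size}Q", *array)

def _hash_ndarray_as_bytes(nd_array):
    # Hash the flattened values with pandas' stable array hasher, then append
    # the shape so equal values in different shapes hash differently.
    return _hash_uint64_ndarray_as_bytes(
        pd.util.hash_array(nd_array.flatten(order="C"))
    ) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, dtype="uint64"))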
def test_hash_tuples(self):
    tups = [(1, 'one'), (1, 'two'), (2, 'one')]
    result = hash_tuples(tups)
    expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
    tm.assert_numpy_array_equal(result, expected)

    result = hash_tuples(tups[0])
    assert result == expected[0]
def test_consistency(self):
    # check that our hash doesn't change because of a mistake
    # in the actual code; this is the ground truth
    result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
    expected = Series(np.array([3600424527151052760,
                                1374399572096150070,
                                477881037637427054],
                               dtype='uint64'),
                      index=['foo', 'bar', 'baz'])
    tm.assert_series_equal(result, expected)
def test_hash_tuples():
    tuples = [(1, "one"), (1, "two"), (2, "one")]
    result = hash_tuples(tuples)

    expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values
    tm.assert_numpy_array_equal(result, expected)

    result = hash_tuples(tuples[0])
    assert result == expected[0]
def partitioning_index(df, npartitions):
    """
    Computes a deterministic index mapping each record to a partition.

    Identical rows are mapped to the same partition.

    Parameters
    ----------
    df : DataFrame/Series/Index
    npartitions : int
        The number of partitions to group into.

    Returns
    -------
    partitions : ndarray
        An array of int64 values mapping each record to a partition.
    """
    return hash_pandas_object(df, index=False) % int(npartitions)
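# A quick illustration of the invariant documented above: identical rows map to
# the same partition and the result is reproducible across calls (toy data,
# invented for the example).
import pandas as pd

df = pd.DataFrame({"k": [1, 2, 1], "v": ["a", "b", "a"]})

parts = partitioning_index(df, npartitions=4)

# Rows 0 and 2 are identical, so they receive the same partition id.
assert parts.iloc[0] == parts.iloc[2]
# Hashing is deterministic, so repeating the call gives the same mapping.
assert (partitioning_index(df, npartitions=4) == parts).all()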
def shuffle_group(df, col, stage, k, npartitions):
    """ Splits dataframe into groups

    The group is determined by their final partition, and which stage
    we are in in the shuffle

    Parameters
    ----------
    df: DataFrame
    col: str
        Column name on which to split the dataframe
    stage: int
        We shuffle dataframes with many partitions in a few stages to avoid
        a quadratic number of tasks. This number corresponds to which stage
        we're in, starting from zero up to some small integer
    k: int
        Desired number of splits from this dataframe
    npartitions: int
        Total number of output partitions for the full dataframe

    Returns
    -------
    out: Dict[int, DataFrame]
        A dictionary mapping integers in {0..k} to dataframes such that the
        hash values of ``df[col]`` are well partitioned.
    """
    if col == '_partitions':
        ind = df[col]
    else:
        ind = hash_pandas_object(df[col], index=False)

    c = ind._values
    typ = np.min_scalar_type(npartitions * 2)

    c = np.mod(c, npartitions).astype(typ, copy=False)
    np.floor_divide(c, k ** stage, out=c)
    np.mod(c, k, out=c)

    indexer, locations = groupsort_indexer(c.astype(np.int64), k)
    df2 = df.take(indexer)
    locations = locations.cumsum()
    parts = [df2.iloc[a:b] for a, b in zip(locations[:-1], locations[1:])]

    return dict(zip(range(k), parts))
def test_pandas_errors(obj):
    msg = "Unexpected type for hashing"
    with pytest.raises(TypeError, match=msg):
        hash_pandas_object(obj)
def test_invalid_key():
    # This only matters for object dtypes.
    msg = "key should be a 16-byte string encoded"

    with pytest.raises(ValueError, match=msg):
        hash_pandas_object(Series(list("abc")), hash_key="foo")
def f():
    hash_pandas_object(Series(list('abc')), hash_key='foo')
def test_invalid_key(self):
    # this only matters for object dtypes
    msg = 'key should be a 16-byte string encoded'
    with tm.assert_raises_regex(ValueError, msg):
        hash_pandas_object(Series(list('abc')), hash_key='foo')