def warm2Columns(dfSrc, column1, column2, badFlag, binSize=10, countLimit=200):
    df = dfSrc[[column1, column2, badFlag]].copy()
    # Quantile bin edges for both columns; np.unique drops duplicate edges.
    bins1 = np.unique(algos.quantile(df[column1], np.linspace(0, 1, binSize + 1)))
    bins2 = np.unique(algos.quantile(df[column2], np.linspace(0, 1, binSize + 1)))
    df[column1 + '_bin'] = pd.tools.tile._bins_to_cuts(df[column1], bins1,
                                                       include_lowest=True)
    df[column2 + '_bin'] = pd.tools.tile._bins_to_cuts(df[column2], bins2,
                                                       include_lowest=True)
    pvMean = df.pivot_table(badFlag, column1 + '_bin', column2 + '_bin',
                            np.mean).fillna(0)
    pvSize = df.pivot_table(badFlag, column1 + '_bin', column2 + '_bin',
                            np.size).fillna(0)
    # Zero out cells backed by fewer than countLimit observations.
    if checkIndex(pvSize):
        for ind in pvSize.index:
            for col in pvSize.columns:
                if np.isnan(pvSize.loc[ind, col].values[0][0]):
                    pvMean.loc[ind, col] = 0
                elif pvSize.loc[ind, col].values[0][0] < countLimit:
                    pvMean.loc[ind, col] = 0
    else:
        for ind in pvSize.index:
            for col in pvSize.columns:
                if np.isnan(pvSize.loc[ind, col]):
                    pvMean.loc[ind, col] = 0
                elif pvSize.loc[ind, col] < countLimit:
                    pvMean.loc[ind, col] = 0
    sns.heatmap(pvMean, annot=True)
    return pvMean, pvSize

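# Usage sketch for warm2Columns (illustrative, not from the original source).
# It assumes a legacy pandas (< 0.20) that still exposes pd.tools.tile,
# seaborn imported as sns, and the module-level checkIndex helper used above
# being in scope. The DataFrame and column names are hypothetical.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
loans = pd.DataFrame({
    'income': rng.lognormal(10, 0.5, 5000),
    'age': rng.randint(18, 75, 5000),
    'is_bad': rng.binomial(1, 0.08, 5000),
})
# Bad-rate heatmap over income deciles x age deciles; cells with fewer than
# 20 observations are zeroed out before plotting.
pv_mean, pv_size = warm2Columns(loans, 'income', 'age', 'is_bad',
                                binSize=10, countLimit=20)
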
def show_orders_hist(order_pd, s_list=None, q_default=10):
    if s_list is None:
        s_list = ['lowBkCnt', 'atr_std', 'jump_power', 'diff_days',
                  'wave_score1', 'wave_score2', 'wave_score3',
                  'deg_60WindowPd', 'deg_hisWindowPd', 'deg_windowPd']
    # Keep only the columns that actually exist in order_pd.
    s_list = filter(lambda x: order_pd.columns.tolist().count(x) > 0, s_list)

    for sn in s_list:
        uq = len(np.unique(order_pd[sn]))
        if uq == 1:
            continue
        bins = 10
        bins = uq // 50 if uq // 50 > bins else bins
        order_pd[sn].hist(bins=bins)
        plt.show()

        try:
            cats = pd.qcut(order_pd[sn], q_default)
        except Exception:
            # Some value repeats more often than a quantile can hold, so
            # qcut cannot split evenly; fall back to unique quantile edges.
            import pandas.core.algorithms as algos
            bins = algos.quantile(np.unique(order_pd[sn]),
                                  np.linspace(0, 1, q_default + 1))
            cats = pd.tools.tile._bins_to_cuts(order_pd[sn], bins,
                                               include_lowest=True)
            # ZLog.info(sn + ' qcut except use bins!')

        ZLog.info('{0} show hist and qcuts'.format(sn))
        ZLog.info(cats.value_counts())

def qcut(x, q, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce a Categorical object indicating
    quantile membership for each data point.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins.
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.
    precision : int
        The precision at which to store and display the bins labels

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
    Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
    >>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]
    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3], dtype=int64)
    """
    x_is_series, series_index, name, x = _preprocess_for_cut(x)

    x, dtype = _coerce_to_type(x)

    if is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision,
                              include_lowest=True, dtype=dtype)

    return _postprocess_for_cut(fac, bins, retbins, x_is_series,
                                series_index, name)

def rmg_qcut(x, q, labels=None, retbins=False, precision=3):
    if com.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    bins = np.unique(bins)
    return pandas.tools.tile._bins_to_cuts(x, bins, labels=labels,
                                           retbins=retbins,
                                           precision=precision,
                                           include_lowest=True)

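# Usage sketch (illustrative): why rmg_qcut de-duplicates the quantile edges.
# It assumes the same environment as the snippet above — a legacy pandas with
# pandas.tools.tile, com = pandas.core.common, algos = pandas.core.algorithms.
import numpy as np
import pandas as pd

tied = pd.Series([0] * 90 + list(range(1, 11)))  # 90% zeros
# pd.qcut(tied, 10) would raise here because several decile edges collapse
# onto 0; rmg_qcut keeps only the unique edges and cuts with those.
cats = rmg_qcut(tied, 10)
print(pd.value_counts(cats))
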
def test_qcut(self):
    arr = np.random.randn(1000)

    labels, bins = qcut(arr, 4, retbins=True)
    ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
    assert_almost_equal(bins, ex_bins)

    ex_levels = cut(arr, ex_bins, include_lowest=True)
    self.assert_(np.array_equal(labels, ex_levels))

def test_qcut(self):
    arr = np.random.randn(1000)

    labels, bins = qcut(arr, 4, retbins=True)
    ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
    tm.assert_almost_equal(bins, ex_bins)

    ex_levels = cut(arr, ex_bins, include_lowest=True)
    self.assert_categorical_equal(labels, ex_levels)

def test_qcut(self):
    arr = np.random.randn(1000)

    labels, bins = qcut(arr, 4, retbins=True)
    ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
    ex_bins[0] -= (arr.max() - arr.min()) * 0.001
    assert_almost_equal(bins, ex_bins)

    ex_levels = cut(arr, ex_bins)
    self.assert_(np.array_equal(labels, ex_levels))

def __init__(self, df):
    df.columns = [x.lower() for x in df.columns]
    target = [col for col in df.columns if 'target' in col]
    target = ''.join(target)
    dfdropped = df[['id', target]]
    to_drop = ['id', target]
    dfc = df.drop(to_drop, axis=1)
    # num_variables = len(dfc.columns.tolist()) - 5
    # r = random.randint(0, num_variables)
    # print(r, r+5)
    dfc_subset = pd.DataFrame()
    col_names = dfc.columns.tolist()
    print(col_names)
    print(type(col_names))
    # Keep only columns whose value range spans more than 5 units.
    for i in range(len(col_names)):
        if dfc[col_names[i]].max() - dfc[col_names[i]].min() > 5:
            dfc_subset[col_names[i]] = df[col_names[i]]
    print(dfc_subset)
    df_names = dfc_subset.columns.values
    index = len(df_names) - 1

    # CREATION OF VARIABLES
    for i in np.delete(df_names, index):
        for j in np.delete(df_names, index):
            dfc[i + "/" + j] = np.where(dfc[j] == 0, 0, dfc[i] / dfc[j])
            dfc[i + "*" + j] = dfc[i] * dfc[j]
            dfc[i + "-" + j] = dfc[i] - dfc[j]
            dfc[i + "+" + j] = dfc[i] + dfc[j]

    # DUMMY AND BINNING
    col_names_subset = dfc_subset.columns.tolist()
    dfc_bins = pd.DataFrame()
    for col in range(len(col_names_subset)):
        bins = algos.quantile(np.unique(dfc_subset[col_names_subset[col]]),
                              np.linspace(0, 1, 11))
        result = pd.tools.tile._bins_to_cuts(dfc_subset[col_names_subset[col]],
                                             bins, include_lowest=True)
        dfc_bins[col_names_subset[col]] = result
        # dfc_bins = pd.qcut(dfc_subset[col_names_subset[col]], 5)
    print("\n Exported csv file with variables created from all possible "
          "ratios, binning, and created dummy variables for a subset of "
          "variables")
    autocreated = pd.concat([dfdropped, dfc, dfc_bins], axis=1)
    autocreated.to_csv("autocreatedvariablesdataset.csv")

def make_boost_dummies(self, orderPd, cats_ss, prefix, regex):
    try:
        cats = pd.qcut(cats_ss, self.qcut_bins)
    except Exception:
        # Some value repeats more often than a quantile can hold, so qcut
        # cannot split evenly; fall back to unique quantile edges instead.
        import pandas.core.algorithms as algos
        bins = algos.quantile(np.unique(cats_ss),
                              np.linspace(0, 1, self.qcut_bins + 1))
        cats = pd.tools.tile._bins_to_cuts(cats_ss, bins, include_lowest=True)
        ZLog.info(prefix + ' qcut except use bins!')

def test_qcut(self):
    arr = np.random.randn(1000)

    # We store the bins as an Index that has been rounded,
    # so comparisons are a bit tricky.
    labels, bins = qcut(arr, 4, retbins=True)
    ex_bins = quantile(arr, [0, .25, .5, .75, 1.])

    result = labels.categories.left.values
    assert np.allclose(result, ex_bins[:-1], atol=1e-2)

    result = labels.categories.right.values
    assert np.allclose(result, ex_bins[1:], atol=1e-2)

    ex_levels = cut(arr, ex_bins, include_lowest=True)
    tm.assert_categorical_equal(labels, ex_levels)

def qcut(x, q, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce a Categorical object indicating
    quantile membership for each data point.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Labels to use for bin edges, or False to return integer bin labels
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.
    precision : int
        The precision at which to store and display the bins labels

    Returns
    -------
    cat : Categorical or Series
        Returns a Series of type category if input is a Series else
        Categorical.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    """
    if com.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    res = _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
                        precision=precision, include_lowest=True)
    if isinstance(x, Series):
        res = Series(res, index=x.index)
    return res

def qcutnew(x, q, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce a Categorical object indicating
    quantile membership for each data point.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins.
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.
    precision : int
        The precision at which to store and display the bins labels

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object
    """
    if com.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    return _bins_to_cuts_new(x, bins, labels=labels, retbins=retbins,
                             precision=precision, include_lowest=True)

def qcut(x, q=4, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce 1000 integers from 0 to 9
    indicating the quantile membership for each data point.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
        Array of quantiles must span [0, 1]
    labels : array or boolean, default None
        Labels to use for bin edges, or False to return integer bin labels
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.

    Returns
    -------

    Notes
    -----

    Examples
    --------
    """
    if com.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    # Nudge the lowest edge down so the minimum value lands inside the first
    # (right-closed) interval.
    bins[0] -= 0.001 * (x.max() - x.min())
    return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
                         precision=precision)

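# A small numeric check (plain numpy) of the edge adjustment in the qcut
# variant above: with right-closed intervals and no include_lowest, the exact
# minimum would fall outside the first bin, so the lowest edge is nudged down
# by 0.1% of the data range.
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
edges = np.percentile(x, [0, 50, 100])   # [1.0, 2.5, 4.0]
edges[0] -= 0.001 * (x.max() - x.min())  # first bin becomes (0.997, 2.5]
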
def qcut(x, q, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce 1000 integers from 0 to 9
    indicating the quantile membership for each data point.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Labels to use for bin edges, or False to return integer bin labels
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.

    Returns
    -------
    cat : Categorical

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    """
    if com.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
                         precision=precision, include_lowest=True)

def qcut(
    x,
    q,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    duplicates: str = "raise",
):
    """
    Quantile-based discretization function.

    Discretize variable into equal-sized buckets based on rank or based
    on sample quantiles. For example 1000 values for 10 quantiles would
    produce a Categorical object indicating quantile membership for each
    data point.

    Parameters
    ----------
    x : 1d ndarray or Series
    q : int or list-like of float
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    precision : int, optional
        The precision at which to store and display the bins labels.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3])
    """
    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    if is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    fac, bins = _bins_to_cuts(
        x,
        bins,
        labels=labels,
        precision=precision,
        include_lowest=True,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)

def autoBinarize(self):
    self.MinValue = self.data[self.var_name].min()
    self.AvgValue = self.data[self.var_name].mean()
    self.MedianValue = self.data[self.var_name].median()
    self.MaxValue = self.data[self.var_name].max()

    justmiss = self.data.loc[self.data[self.var_name].isnull(),
                             [self.var_name, self.target]]
    notmiss = self.data.loc[self.data[self.var_name].notnull(),
                            [self.var_name, self.target]]
    r = 0
    n = 20
    best_r = 0
    best_n = 0
    if notmiss.shape[0] < self.Total * 0.005:
        # non-empty records less than 0.5% of Total: use a single bucket
        d1 = pd.DataFrame({
            "X": notmiss[self.var_name],
            "Y": notmiss[self.target],
            "Bucket": pd.qcut(notmiss[self.var_name], 1, duplicates='drop')
        })
        d2 = d1.groupby('Bucket', as_index=True)
    else:
        # Shrink n until the bucket means are monotonic in the target
        # (|Spearman r| ~ 1), remembering the best correlation seen so far.
        while np.abs(r) < 0.99999 and n > 0:
            try:
                d1 = pd.DataFrame({
                    "X": notmiss[self.var_name],
                    "Y": notmiss[self.target],
                    "Bucket": pd.qcut(notmiss[self.var_name], n,
                                      duplicates='drop')
                })
                d2 = d1.groupby('Bucket', as_index=True)
                r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
                if np.abs(r) > np.abs(best_r):
                    best_r = r
                    best_n = n
                n = n - 1
            except Exception as e:
                print("Exception for variable %s step n = %i: %s" %
                      (self.var_name, n, e))
                n = n - 1
        if len(d2) == 1:
            try:
                n = best_n
                bins = algos.quantile(notmiss[self.var_name],
                                      np.linspace(0, 1, n))
                if len(np.unique(bins)) == 2:
                    bins = np.insert(bins, 0, 1)
                    bins[1] = bins[1] - (bins[1] / 2)
                d1 = pd.DataFrame({
                    "X": notmiss[self.var_name],
                    "Y": notmiss[self.target],
                    "Bucket": pd.cut(notmiss[self.var_name], np.unique(bins),
                                     include_lowest=True)
                })
                d2 = d1.groupby('Bucket', as_index=True)
            except Exception as e:
                print("Exception for variable %s step n = %i: %s" %
                      (self.var_name, n, e))
                d1 = pd.DataFrame({
                    "X": notmiss[self.var_name],
                    "Y": notmiss[self.target],
                    "Bucket": pd.qcut(notmiss[self.var_name], 1,
                                      duplicates='drop')
                })
                d2 = d1.groupby('Bucket', as_index=True)

    self.intervals['Variable'] = self.var_name
    self.intervals['MinValue'] = d2.min().X
    self.intervals['MaxValue'] = d2.max().X
    self.intervals['Interval'] = [
        ' - '.join(str(x) for x in y) for y in map(
            tuple, self.intervals[['MinValue', 'MaxValue']].values)
    ]
    self.intervals['Total'] = d2.count().Y
    self.intervals['Bads'] = d2.sum().Y
    self.intervals.loc[np.isnan(self.intervals['Bads']), 'Bads'] = 0
    self.intervals['Goods'] = d2.count().Y - d2.sum().Y
    self.intervals.loc[np.isnan(self.intervals['Goods']), 'Goods'] = 0

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MinValue': np.nan}, index=[0])
        d4['MaxValue'] = np.nan
        d4['Interval'] = "Missing Value"
        d4['Total'] = justmiss.count()[self.target]
        d4['Bads'] = justmiss.sum()[self.target]
        d4['Goods'] = justmiss.count()[self.target] - justmiss.sum()[self.target]
        self.intervals = self.intervals.append(d4, ignore_index=True,
                                               sort=True)

    # Here has to be the common code from the Base class
    BinVariable.autoBinarize(self)

def mono_bin(Y, X, max_bin, force_bin):
    """
    Binning function for int and float variables that are not binary
    indicator variables.

    Parameters
    ----------
    Y : pandas series
        target vector
    X : pandas series
        predictor vector to be binned
    max_bin : int
        the maximum number of bins (categories) for numeric variable binning.
    force_bin : int
        For some numeric variables, the mono_bin function may produce only one
        category while binning. `force_bin` ensures that at least two
        categories will be produced.

    Return
    ------
    d3 : pandas dataframe
        Weight of evidence / information value table and other data used to
        calculate WOE and IV for variable i in dataset
    """
    n = max_bin
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    r = 0
    # Shrink n until the bucket means are perfectly monotonic in the target.
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({
                "X": notmiss.X,
                "Y": notmiss.Y,
                "Bucket": pd.qcut(notmiss.X, n)
            })
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
        except Exception:
            n = n - 1

    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1] - (bins[1] / 2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y,
                           "Bucket": pd.cut(notmiss.X, np.unique(bins),
                                            include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    print(d3)
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)
    d3 = d3.drop(d3[d3.COUNT == 0].index)

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT / d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_NON_EVENT / d3.DIST_EVENT)
    d3["IV"] = (d3.DIST_NON_EVENT - d3.DIST_EVENT) * np.log(
        d3.DIST_NON_EVENT / d3.DIST_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3['max_range'] = d3['MAX_VALUE']
    d3["min_range"] = d3.groupby('VAR_NAME')['MAX_VALUE'].shift(1)
    d3.loc[d3['min_range'].isnull(), 'min_range'] = -np.inf
    d3.loc[d3['MIN_VALUE'].isnull(), 'min_range'] = np.nan
    _max = d3.loc[d3['max_range'] != np.nan, 'max_range'].max()
    d3.loc[d3['max_range'] == _max, 'max_range'] = np.inf
    d3['bucket'] = ('(' + d3['min_range'].astype(str) + ', ' +
                    d3['max_range'].astype(str) + ']')
    d3.loc[d3['bucket'] == '(nan, nan]', 'bucket'] = 'missing'
    d3 = d3[['VAR_NAME', 'MIN_VALUE', 'MAX_VALUE', 'min_range', 'max_range',
             'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE',
             'DIST_EVENT', 'DIST_NON_EVENT', 'bucket', 'WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()

    return d3

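# Usage sketch for mono_bin on synthetic data (illustrative; assumes numpy,
# pandas, scipy.stats and pandas.core.algorithms are imported as in the
# surrounding snippets). The score/target relationship below is made up.
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
score = pd.Series(rng.normal(600, 50, 10000))
prob_bad = 1 / (1 + np.exp((score - 600) / 25))  # bad rate falls as score rises
target = pd.Series((rng.rand(10000) < prob_bad).astype(int))

woe_table = mono_bin(target, score, max_bin=20, force_bin=3)
print(woe_table[['bucket', 'COUNT', 'EVENT_RATE', 'WOE', 'IV']])
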
def finger(X, y=None, cut_points=None, n_quantiles=4, labels=None,
           min_val=None, max_val=None, **params):
    """Manually bins continuous variable into the declared intervals. If the
    cut-off points are not declared the split is made using quantiles.

    Parameters
    ----------
    X: array-like, shape = (n_samples, )
        Vector passed as a one-dimensional array-like object where n_samples
        is the number of samples.

    y: Ignored

    cut_points: array-like, optional (default=None)
        Increasing monotonic sequence generating right-closed intervals.
        Values not allocated to any of the categories will be assigned to the
        empty set. For example given: cut_points=[1, 5, 9] will generate
        intervals: [X.min(), 1], (1, 5], (5, 9], (9, X.max()]. If you want to
        specify lower and upper limitations, set parameters: "min_val",
        "max_val" to a specific value.

    n_quantiles: int, optional (default=4)
        When cut_points are not declared it sets the number of quantiles into
        which the variable will be split. For example setting n_quantiles = 4
        will return quartiles of X values between min_val and max_val.

    labels: string: {'auto'} or list, optional (default=None)
        Specifies returned bucket names, needs to be the same length as the
        number of created buckets:

        - `auto`: Assigns default values to group names by numbering them.

    min_val: float, optional (default=None)
        Determines lower limit value. If not specified takes -np.inf.

    max_val: float, optional (default=None)
        Determines upper limit value. If not specified takes np.inf.

    Returns
    -------
    X_new: array, shape = (n_samples, )
        Input data with its original values being substituted with their
        respective labels.
    """
    X = np.asarray(X)
    x = X[~np.isnan(X)]

    if min_val is None:
        min_val = -np.inf

    if max_val is None:
        max_val = np.inf

    # Default break_points in case of no declaration of cut_points
    if cut_points is None:
        x = x[(x >= min_val) & (x <= max_val)]
        break_points = algos.quantile(
            np.unique(x),
            np.linspace(0, 1, n_quantiles + 1)
        )
    else:
        break_points = np.insert(
            cut_points.astype(float),
            [0, len(cut_points)],
            [min_val, max_val]
        )
    break_points = np.unique(break_points)

    if labels == 'auto':
        labels = range(len(break_points) - 1)

    X_new = pd.cut(
        X,
        bins=break_points,
        labels=labels,
        include_lowest=True
    )
    return X_new

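# Usage sketch for finger (illustrative; assumes numpy, pandas and
# algos = pandas.core.algorithms are in scope as above). NaNs pass through
# pd.cut and stay NaN in the output.
import numpy as np

ages = np.array([3, 17, 25, 34, 41, 58, 62, 79, np.nan])
print(finger(ages, n_quantiles=4, labels='auto'))       # quartile buckets 0..3
print(finger(ages, cut_points=np.array([18, 35, 65])))  # (-inf,18] .. (65,inf]
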
meta = df.describe()
meta.to_excel(
    r'C:\Users\wanti\Desktop\MMA\MMA 831 Marketing Analytics\MMA831_midterm\meta2.xlsx',
    index=False)

###########################
# generate logit plot

# logit plot
for i in range(1, 100):
    # find the right size to bin x
    test = df.loc[:, [df.columns[i], 'target2']]
    test = test.dropna()
    bins = np.unique(
        algos.quantile(test.loc[:, df.columns[i]], np.linspace(0, 1, 11)))
    test['bin'] = pd.cut(test.loc[:, df.columns[i]], bins, right=False)

    # using bin to get logit y
    tt = test.groupby(['bin'], as_index=False).agg({
        'target2': ['count', 'sum'],
        df.columns[i]: 'mean'
    })
    tt['logity'] = np.log(
        (tt.iloc[:, 2] + 1) / (tt.iloc[:, 1] - tt.iloc[:, 2] + 1))

    # plot it
    plt.figure()
    plt.plot(tt.iloc[:, 3], tt['logity'], color='blue')
    plt.title(df.columns[i] + " vs target")
    plt.show()

def quantile(x):
    vals = x.values
    return algos.quantile(vals, np.linspace(0, 1, 11))

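# Plausible usage (illustrative): a helper with this shape drops straight into
# a pandas groupby-apply, yielding decile edges per group. It assumes
# algos = pandas.core.algorithms as in the other snippets; the DataFrame and
# column names below are made up.
import numpy as np
import pandas as pd

frame = pd.DataFrame({'group': list('aabbcc') * 50,
                      'value': np.random.randn(300)})
decile_edges = frame.groupby('group')['value'].apply(quantile)
print(decile_edges)
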
def show_orders_hist(df, feature_columns, show=True, only_hist=True,
                     show_pie=False):
    """
    Visualize histograms or pie charts of the features named in
    feature_columns within df; depending on only_hist, also print
    pd.qcut bucket statistics.

    eg:
        from abupy import AbuML, ml
        ttn_raw = AbuML.load_ttn_raw_df()
        ml.show_orders_hist(ttn_raw, ['Age', 'Fare', 'Pclass'])

    :param df: pd.DataFrame object
    :param feature_columns: sequence of feature names,
                            eg: ['Age', 'Fare', 'Pclass']
    :param show: whether to draw the histograms or pie charts
    :param show_pie: whether to prefer a pie chart, default False
    :param only_hist: if True, only plot and skip the pd.qcut statistics
    """
    if not isinstance(df, pd.DataFrame):
        logging.info('df must be pd.DataFrame, not type {}'.format(type(df)))
        return

    # Step 1: drop feature_columns entries that are not columns of df.
    feature_columns = list(
        filter(lambda x: df.columns.tolist().count(x) > 0, feature_columns))
    # Step 2: drop entries whose dtype is neither int nor float.
    feature_columns = list(
        filter(
            lambda x: df[x].dtype == int or df[x].dtype == float or
            df[x].dtype == np.uint or df[x].dtype == np.uint8,
            feature_columns))
    # Step 3: drop columns with a single unique value, eg: a column of all 1s
    # or all 0s, which cannot be binned.
    feature_columns = list(
        filter(lambda x: len(np.unique(df[x])) > 1, feature_columns))

    axs_list = None
    if len(feature_columns) == 0:
        # Everything was filtered out: nothing to do.
        logging.info(
            '{}\n{}\nnot exist! or unique==1!, or dtype != int or float'.
            format(df.columns, df.dtypes))
        return

    if show:
        # For the histogram grid, two subplots per row, so take
        # n_rows = ceil(len / 2), eg: 3 / 2 = 2.
        n_rows = int(math.ceil(len(feature_columns) / 2))
        # Row height 5, total figure height: n_rows * 5.
        fig_h = n_rows * 5
        # plt.subplots creates the subplot grid.
        _, axs = plt.subplots(nrows=n_rows, ncols=2, figsize=(14, fig_h))
        # With more than one row, flatten the axes grid to a 1d sequence.
        axs_list = axs if n_rows == 1 else list(
            itertools.chain.from_iterable(axs))

    for ind, feature in enumerate(feature_columns):
        feature_unique = len(np.unique(df[feature]))
        ax = None
        if axs_list is not None:
            ax = axs_list[ind]
            ax.set_title(feature)
        if show_pie and feature_unique < 10:
            # Fewer than 10 unique values: draw a pie chart via value_counts.
            df[feature].value_counts().plot(ax=ax, kind='pie')
        else:
            # Otherwise draw a histogram.
            bins = int(feature_unique / 50) if feature_unique / 50 > 10 else 10
            df[feature].hist(ax=ax, bins=bins)

        if only_hist:
            # Visualization only: skip the qcut statistics.
            continue

        try:
            # qcut into 10 equal-frequency buckets.
            cats = pd.qcut(df[feature], 10)
        except Exception:
            # Some value repeats more often than a quantile can hold, so
            # qcut cannot split evenly; fall back to unique quantile edges.
            import pandas.core.algorithms as algos
            bins = algos.quantile(np.unique(df[feature]),
                                  np.linspace(0, 1, 10 + 1))
            # noinspection PyProtectedMember,PyUnresolvedReferences
            cats = pd.tools.tile._bins_to_cuts(df[feature], bins,
                                               include_lowest=True)
        logging.info('{0} show hist and qcuts'.format(feature))
        """
            Age show hist and qcuts
            (31.8, 36]    91
            (14, 19]      87
            (41, 50]      78
            [0.42, 14]    77
            (22, 25]      70
            (19, 22]      67
            (28, 31.8]    66
            (50, 80]      64
            (25, 28]      61
            (36, 41]      53
            Name: Age, dtype: int64
        """
        logging.info(cats.value_counts())

# To generate the logit plot, create a binary target first.
df['target2'] = np.where(df['target'] > 0, 1, 0)
df['target2'].value_counts()
df['target'].value_counts()

# logit plot
for i in range(1, 100):
    # find the right size to bin x
    test = df.loc[:, [df.columns[i], 'target2']]
    bins = np.unique(algos.quantile(test.loc[:, df.columns[i]],
                                    np.linspace(0, 1, 11)))
    test['bin'] = pd.cut(test.loc[:, df.columns[i]], bins, right=False)

    # using bin to get logit y
    tt = test.groupby(['bin'], as_index=False).agg({
        'target2': ['count', 'sum'],
        df.columns[i]: 'mean'})
    tt['logity'] = np.log((tt.iloc[:, 2] + 1) /
                          (tt.iloc[:, 1] - tt.iloc[:, 2] + 1))

    # plot it
    plt.figure()
    plt.plot(tt.iloc[:, 3], tt['logity'], color='blue')
    plt.title(df.columns[i] + " vs target")
    plt.show()

def bucket_data(df, buckets, label=None, privacy=None, verbose=0,
                bin_features=None):
    df = df.copy()

    # partition continuous and integer data into buckets
    for col in df.columns:
        do_buckets = col != label and (
            df[col].dtype == 'float64' or
            (df[col].dtype == 'int64' and len(set(df[col])) > buckets)
        ) and (bin_features is None or col in bin_features)
        if not do_buckets:
            continue
        if verbose >= 2:
            print('bucketing column', repr(col))
        arr = np.nan_to_num(df[col].as_matrix())

        # this is here to mask out zeros, in case the majority of values are
        # zeros and it's impossible to do normal bucketing
        #mx = np.ma.masked_equal(arr, 0, copy=True)
        #bins = algos.quantile(arr[~mx.mask], np.linspace(0, 1, buckets+1))
        # then add back in a bucket specifically for zeros
        #bins = np.insert(bins, 0, 0)
        #bins[1] = bins[1] - bins[1] / 2

        epsilon = 1e-10 if df[col].dtype == 'float64' else 1
        bins = algos.quantile(arr, np.linspace(0, 1, buckets + 1))
        if privacy is not None and privacy > 0:
            assert buckets == 2
            median = estimate_median_private(arr, privacy, min(arr), max(arr))
            bins = np.array([0, median, max(arr)])
            if verbose >= 2:
                print('median real', sorted(arr)[len(arr) // 2],
                      'estimate', median)
        # Force strictly increasing edges so pd.cut accepts them.
        for i in range(1, len(bins)):
            if bins[i] <= bins[i - 1]:
                bins[i] = bins[i - 1] + epsilon
        df[col] = pd.cut(arr, bins, labels=range(buckets),
                         include_lowest=True)
        continue

        # Everything below the unconditional continue is unreachable legacy
        # code, kept here as it was in the original.
        # tools.tile gone as of pandas 16 :(
        #cuts = pd.tools.tile._bins_to_cuts(arr, bins, labels=range(buckets),
        #                                   include_lowest=True)
        #df[col] = pd.qcut(df[col], buckets, labels=range(buckets))

        # sample values until you get enough real ones. This doesn't work if
        # there are too many "NaN"s.
        #sample = df.sample(n=int(math.sqrt(len(df[col])) + 1))[col].copy()
        sample = df[col].sort_values(inplace=False, na_position='last')
        num_num = len(sample.dropna(inplace=False))
        n = float(num_num) / buckets
        # these are the percentiles of the numbers in the series - dictating
        # the boundaries of the buckets.
        # we only do this convoluted thing here to support the sampling step
        # above. Otherwise we would just sort everything and put elements
        # [n:i+n] into each bucket.
        bucket_list = [sample.iloc[int(i * n)] for i in range(1, buckets)]
        if False:
            # simple method
            bucket_list = sorted(set(bucket_list))
            for i, row in df.iterrows():
                if np.isnan(row[col]):
                    val = 0
                else:
                    val = next((idx for idx, b in enumerate(bucket_list)
                                if b >= row[col]), len(bucket_list))
                df.set_value(i, col, val)
            print('Bucket values for %s:' % col)
            print('\tv <= %.3f' % bucket_list[0])
            for i in range(len(bucket_list) - 1):
                v = bucket_list[i]
                nv = bucket_list[i + 1]
                print('\t%.3f < v <= %.3f' % (v, nv))
            print('\tv > %.3f' % bucket_list[-1])
            print()
        else:
            # more complicated method
            bucket_vals = {}
            exact_vals = {}
            idx = 0
            for v in sorted(list(set(bucket_list))):
                idx += 1
                bucket_vals[v] = idx
                if bucket_list.count(v) > 1:
                    idx += 1
                    exact_vals[v] = idx
            sorted_vals = sorted(bucket_vals.items())
            print('Bucket values for %s:' % col)
            print('\tv <= %.3f' % sorted_vals[0][0])
            for i in range(len(sorted_vals) - 1):
                v = sorted_vals[i][0]
                nv = sorted_vals[i + 1][0]
                if v in exact_vals:
                    print('\tv == %.3f' % v)
                if nv in exact_vals:
                    print('\t%.3f < v < %.3f' % (v, nv))
                else:
                    print('\t%.3f < v <= %.3f' % (v, nv))
            print('\tv > %.3f' % sorted_vals[-1][0])
            print()
            num_buckets = len(bucket_vals) + len(exact_vals)
            for i, row in df.iterrows():
                if np.isnan(row[col]):
                    val = 0
                elif row[col] in exact_vals:
                    val = exact_vals[row[col]]
                else:
                    val = next((idx for b, idx in sorted_vals
                                if b >= row[col]), num_buckets)
                df.set_value(i, col, val)
        df[col] = df[col].astype(int)

    return df

def test_quantile():
    s = Series(np.random.randn(100))

    result = algos.quantile(s, [0, .25, .5, .75, 1.])
    expected = algos.quantile(s.values, [0, .25, .5, .75, 1.])
    tm.assert_almost_equal(result, expected)

def mono_bin(Y, X, n=max_bin):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({
                "X": notmiss.X,
                "Y": notmiss.Y,
                "Bucket": pd.qcut(notmiss.X, n)
            })
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
            # print(r)
        except Exception:
            n = n - 1

    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1] - (bins[1] / 2)
        d1 = pd.DataFrame({
            "X": notmiss.X,
            "Y": notmiss.Y,
            "Bucket": pd.cut(notmiss.X, np.unique(bins), include_lowest=True)
        })
        d2 = d1.groupby('Bucket', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.sum().EVENT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.EVENT_RATE / d3.NON_EVENT_RATE)
    d3["IV"] = (d3.EVENT_RATE - d3.NON_EVENT_RATE) * np.log(
        d3.EVENT_RATE / d3.NON_EVENT_RATE)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME', 'MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT',
             'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()

    return d3

def resample_by_magnitude(input_shapefile, output_shapefile, target_field,
                          bins=10, fields_to_keep=[], bootstrap=True,
                          output_samples=None, validation_file=None,
                          validation_points=100):
    """
    Parameters
    ----------
    input_shapefile: str
    output_shapefile: str
    target_field: str
        target field name based on which resampling is performed. Field
        must exist in the input_shapefile
    bins: int
        number of bins for sampling
    fields_to_keep: list
        of strings to store in the output shapefile
    bootstrap: bool, optional
        whether to sample with replacement or not
    output_samples: int, optional
        number of samples in the output shpfile. If not provided, the output
        samples will be assumed to be the same as the original shapefile
    validation_file: str, optional
        validation file name
    validation_points: int, optional
        approximate number of points in the validation shapefile

    Returns
    -------
    """
    log.info("resampling shapefile by values")
    if bootstrap and validation_file:
        raise ValueError('bootstrapping should not be used while '
                         'creating a validation shapefile.')
    if len(fields_to_keep):
        fields_to_keep.append(target_field)
    else:
        fields_to_keep = [target_field]
    gdf_out = filter_fields(fields_to_keep, input_shapefile)

    # the idea is stolen from pandas.qcut
    # pd.qcut does not work for cases where it results in non-unique bin edges
    target = gdf_out[target_field].values
    bin_edges = algos.quantile(np.unique(target), np.linspace(0, 1, bins + 1))
    result = pd.tools.tile._bins_to_cuts(target, bin_edges, labels=False,
                                         include_lowest=True)

    # add to output df for sampling
    gdf_out[BIN] = result
    dfs_to_concat = []
    validation_dfs_to_concat = []
    total_samples = output_samples if output_samples else gdf_out.shape[0]
    samples_per_bin = total_samples // bins
    validate_array = np.ones(bins, dtype=np.bool)

    if validation_file and bins > validation_points:
        validate_array[validation_points:] = False
        np.random.shuffle(validate_array)

    gb = gdf_out.groupby(BIN)
    for i, (b, gr) in enumerate(gb):
        if bootstrap:
            dfs_to_concat.append(gr.sample(n=samples_per_bin,
                                           replace=bootstrap))
        else:
            _df, v_df = _sample_without_replacement(gr, samples_per_bin,
                                                    validate_array[i])
            dfs_to_concat.append(_df)
            validation_dfs_to_concat.append(v_df)

    final_df = pd.concat(dfs_to_concat)
    final_df.sort_index(inplace=True)
    final_df.drop(BIN, axis=1).to_file(output_shapefile)

    if validation_file:
        validation_df = pd.concat(validation_dfs_to_concat)
        validation_df.to_file(validation_file)
        log.info('Wrote validation shapefile {}'.format(validation_file))

    return output_shapefile

def resample_by_magnitude(input_data, target_field, bins=10,
                          interval='percentile', fields_to_keep=[],
                          bootstrap=True, output_samples=None,
                          validation=False, validation_points=100):
    """
    Parameters
    ----------
    input_data : geopandas.GeoDataFrame
        Geopandas dataframe containing targets to be resampled.
    target_field : str
        target field name based on which resampling is performed. Field
        must exist in the input data
    bins : int
        number of bins for sampling
    fields_to_keep : list
        of strings to store in the output shapefile
    bootstrap : bool, optional
        whether to sample with replacement or not
    output_samples : int, optional
        number of samples in the output shpfile. If not provided, the output
        samples will be assumed to be the same as the original shapefile
    validation : bool, optional
        whether to create a validation set
    validation_points : int, optional
        approximate number of points in the validation shapefile

    Returns
    -------
    """
    if bootstrap and validation:
        raise ValueError('bootstrapping should not be used while '
                         'creating a validation shapefile.')
    if interval not in ['percentile', 'linear']:
        _logger.warning(
            "Interval method '{}' not recognised, defaulting to 'percentile'"
            .format(interval))
        interval = 'percentile'

    if len(fields_to_keep):
        fields_to_keep.append(target_field)
    else:
        fields_to_keep = [target_field]
    gdf_out = prepapre_dataframe(input_data, fields_to_keep)

    # the idea is stolen from pandas.qcut
    # pd.qcut does not work for cases where it results in non-unique bin edges
    target = gdf_out[target_field].values
    if interval == 'percentile':
        bin_edges = algos.quantile(np.unique(target),
                                   np.linspace(0, 1, bins + 1))
    elif interval == 'linear':
        bin_edges = np.linspace(np.min(target), np.max(target), bins + 1)
    result = pd.core.reshape.tile._bins_to_cuts(target, bin_edges,
                                                labels=False,
                                                include_lowest=True)

    # add to output df for sampling
    gdf_out[BIN] = result[0]
    dfs_to_concat = []
    validation_dfs_to_concat = []
    total_samples = output_samples if output_samples else gdf_out.shape[0]
    samples_per_bin = total_samples // bins
    validate_array = np.ones(bins, dtype=np.bool)

    if validation and bins > validation_points:
        validate_array[validation_points:] = False
        np.random.shuffle(validate_array)

    gb = gdf_out.groupby(BIN)
    for i, (b, gr) in enumerate(gb):
        if bootstrap:
            dfs_to_concat.append(gr.sample(n=samples_per_bin,
                                           replace=bootstrap))
        else:
            _df, v_df = _sample_without_replacement(gr, samples_per_bin,
                                                    validate_array[i])
            dfs_to_concat.append(_df)
            validation_dfs_to_concat.append(v_df)

    final_df = pd.concat(dfs_to_concat)
    final_df.sort_index(inplace=True)
    output_gdf = final_df.drop(BIN, axis=1)

    if validation:
        validation_df = pd.concat(validation_dfs_to_concat)
        return output_gdf, validation_df
    else:
        return output_gdf

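# Usage sketch for the GeoDataFrame variant (illustrative and heavily hedged:
# it assumes geopandas/shapely are installed, that the module-level BIN
# constant and the prepapre_dataframe and _sample_without_replacement helpers
# are in scope, and a pandas version exposing pd.core.reshape.tile).
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

rng = np.random.RandomState(1)
gdf = gpd.GeoDataFrame({'mag': rng.lognormal(0, 1, 1000)},
                       geometry=[Point(xy) for xy in rng.rand(1000, 2)])
# Rebalance so each magnitude decile contributes ~50 of the 500 output rows.
balanced = resample_by_magnitude(gdf, 'mag', bins=10, bootstrap=True,
                                 output_samples=500)
print(balanced['mag'].describe())
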