def cleaneddf(no_bins=0):
    # you'll want to tweak this to conform with your computer's file system
    testpath = r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\test.csv'
    trainpath = r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\train.csv'
    print trainpath
    traindf = pd.read_csv(trainpath)
    testdf = pd.read_csv(testpath)
    # discretise fare
    if no_bins == 0:
        return [cleandf(traindf), cleandf(testdf)]
    traindf = cleandf(traindf)
    testdf = cleandf(testdf)
    bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins=True)
    bins = bins_and_binned_fare[1]
    traindf.Fare = bins_and_binned_fare[0]
    testdf.Fare = pd.cut(testdf.Fare, bins)
    # discretise age
    bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins=True)
    bins = bins_and_binned_age[1]
    traindf.Age = bins_and_binned_age[0]
    testdf.Age = pd.cut(testdf.Age, bins)
    # create a submission file for kaggle
    predictiondf = pd.DataFrame(testdf['PassengerId'])
    predictiondf['Survived'] = [0 for x in range(len(testdf))]
    predictiondf.to_csv(r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\prediction.csv', index=False)
    return [traindf, testdf]
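The function above fits quantile edges on the training fares and reuses those edges on the test set. A minimal, self-contained sketch of that fit-on-train / apply-to-test pattern (the fare values below are invented for illustration):

import pandas as pd

train_fare = pd.Series([7.25, 7.9, 8.05, 13.0, 26.0, 35.5, 71.3, 263.0])
test_fare = pd.Series([9.0, 15.0, 80.0])

# Fit quantile bin edges on the training column only ...
binned_train, edges = pd.qcut(train_fare, 4, retbins=True)
# ... then apply the same edges to the test column.
binned_test = pd.cut(test_fare, edges)

print(edges)
print(binned_test)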
def logsums(name, dir_name):
    # logsum = 'CFULL/SHO'
    logsum_output = 'outputs/grouped/logsums.csv'
    df = pd.read_csv(os.path.join(dir_name, 'aggregate_logsums.1.dat'),
                     delim_whitespace=True, skipinitialspace=True)
    df = df.reset_index()
    df = pd.DataFrame(df[['level_0', logsum]])
    df['source'] = name

    # Separate into accessibility bins
    df['accessibility'] = pd.qcut(df[logsum], 5,
                                  labels=['lowest', 'low', 'moderate', 'high', 'highest'])
    bins = pd.qcut(df[logsum], 5, retbins=True)[1]
    df.columns = ['taz', 'logsum', 'source', 'accessibility']

    # Attach population
    hh = pd.read_csv(os.path.join(dir_name, '_household.tsv'), sep='\t')
    df_pop = pd.DataFrame(hh.groupby('hhtaz').sum()['hhsize'])
    df_pop['taz'] = df_pop.index
    df = pd.merge(df, df_pop, on='taz', how='left')
    df.columns = [['taz', 'logsum', 'source', 'accessibility', 'population']]

    # Write to file
    if os.path.exists(logsum_output):
        df_current = pd.read_csv(logsum_output)
        df_current.append(df).to_csv(logsum_output, index=False)
    else:
        df.to_csv(logsum_output, index=False)
def cleaneddf(no_bins=0): #you'll want to tweak this to conform with your computer's file system trainpath = 'C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/rawtrain.csv' testpath = 'C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/rawtest.csv' traindf = pd.read_csv(trainpath) testdf = pd.read_csv(testpath) #discretise fare if no_bins==0: return [cleandf(traindf), cleandf(testdf)] traindf=cleandf(traindf) testdf=cleandf(testdf) bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins=True) bins=bins_and_binned_fare[1] traindf.Fare = bins_and_binned_fare[0] testdf.Fare = pd.cut(testdf.Fare, bins) #discretise age bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins=True) bins=bins_and_binned_age[1] traindf.Age = bins_and_binned_age[0] testdf.Age = pd.cut(testdf.Age, bins) #create a submission file for kaggle predictiondf = pd.DataFrame(testdf['PassengerId']) predictiondf['Survived']=[0 for x in range(len(testdf))] predictiondf.to_csv('C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/prediction.csv', index=False) return [traindf, testdf]
def slide_14():
    ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
    bins = [18, 25, 35, 60, 100]
    cats = pd.cut(ages, bins)
    print cats

    # use codes, not labels
    # print cats.labels
    print cats.codes

    # print cats.levels
    # use categories, not levels
    print cats.categories

    print pd.value_counts(cats)
    print pd.cut(ages, [18, 26, 36, 61, 100], right=False)

    group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
    print pd.cut(ages, bins, labels=group_names)

    data = np.random.rand(20)
    print data
    print pd.cut(data, 3, precision=2)

    data = np.random.randn(1000)
    cats = pd.qcut(data, 3)
    print cats
    print pd.value_counts(cats)
    print pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
def cleaneddf(no_bins=0): #you'll want to tweak this to conform with your computer's file system trainpath = '../../data/train.csv' testpath = '../../data/test.csv' traindf = pd.read_csv(trainpath) testdf = pd.read_csv(testpath) #discretise fare if no_bins==0: return [cleandf(traindf), cleandf(testdf)] traindf=cleandf(traindf) testdf=cleandf(testdf) bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins=True) bins=bins_and_binned_fare[1] traindf.Fare = bins_and_binned_fare[0] testdf.Fare = pd.cut(testdf.Fare, bins) #discretise age bins_and_binned_age = pd.qcut(traindf.Age+jitter(traindf.Age), no_bins, retbins=True) bins=bins_and_binned_age[1] traindf.Age = bins_and_binned_age[0] testdf.Age = pd.cut(testdf.Age, bins) #create a submission file for kaggle predictiondf = pd.DataFrame(testdf['PassengerId']) predictiondf['Survived']=[0 for x in range(len(testdf))] predictiondf.to_csv('./prediction.csv', index=False) return [traindf, testdf]
def create_figure(): xs = df[x.value].values ys = df[y.value].values x_title = x.value.title() y_title = y.value.title() kw = dict() if x.value in discrete: kw['x_range'] = sorted(set(xs)) if y.value in discrete: kw['y_range'] = sorted(set(ys)) kw['title'] = "%s vs %s" % (x_title, y_title) p = figure(plot_height=600, plot_width=800, tools='pan,box_zoom,reset', **kw) p.xaxis.axis_label = x_title p.yaxis.axis_label = y_title if x.value in discrete: p.xaxis.major_label_orientation = pd.np.pi / 4 sz = 9 if size.value != 'None': groups = pd.qcut(df[size.value].values, len(SIZES)) sz = [SIZES[xx] for xx in groups.codes] c = "#31AADE" if color.value != 'None': groups = pd.qcut(df[color.value].values, len(COLORS)) c = [COLORS[xx] for xx in groups.codes] p.circle(x=xs, y=ys, color=c, size=sz, line_color="white", alpha=0.6, hover_color='white', hover_alpha=0.5) show(p) return p
def preproc_households(store): df = store['households'] df['tenure'] = df.hownrent.map({1: 'own', 2: 'rent'}) # need to keep track of base year income quartiles for use in the # transition model - even caching doesn't work because when you add # rows via the transitioning, you automatically clear the cache! # this is pretty nasty and unfortunate df["base_income_quartile"] = pd.Series(pd.qcut(df.income, 4, labels=False), index=df.index).add(1) df["base_income_octile"] = pd.Series(pd.qcut(df.income, 8, labels=False), index=df.index).add(1) # there are some overrides where we move households around in order # to match the city totals - in the future we will resynthesize and this # can go away - this csv is generated by scripts/match_city_totals.py overrides = pd.read_csv("data/household_building_id_overrides.csv", index_col="household_id").building_id df.loc[overrides.index, "building_id"] = overrides.values # turns out we need 4 more households new_households = df.loc[[1132542, 1306618, 950630, 886585]].reset_index() # keep unique index new_households.index += pd.Series(df.index).max() + 1 df = df.append(new_households) store['households_preproc'] = df
def cleaneddf(no_bins=0): trainpath = 'Titanic/train.csv' testpath = 'Titanic/test.csv' traindf = pd.read_csv(trainpath) testdf = pd.read_csv(testpath) #discretise fare if no_bins == 0: return [cleandf(traindf), cleandf(testdf)] traindf = cleandf(traindf) testdf = cleandf(testdf) bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins = True) bins = bins_and_binned_fare[1] traindf.Fare = bins_and_binned_fare[0] testdf.Fare = pd.cut(testdf.Fare, bins) #discrete age bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins = True) bins = bins_and_binned_age[1] traindf.Age = bins_and_binned_age[0] testdf.Age = pd.cut(testdf.Age, bins) #create a file for kaggle predictiondf = pd.DataFrame(testdf['PassengerId']) predictiondf['Survived']=[0 for x in range(len(testdf))] predictiondf.to_csv('Titanic/prediction.csv', index = False) return [traindf, testdf]
def performBinning(self, x): # Assign initial value to entropy and best number of bins bestEntropy = 1.0 best = 0 for i in bins: try: data2 = [x, self.df['TARGET']] data = pd.concat(data2, axis=1) try: data['binned'] = pd.qcut(data.ix[:,0], i, labels=False) # In case there is no differenciation except: data['binned'] = data.ix[:,0] bindf = pd.DataFrame(index=range(round(float(data.shape[0])/(i+1))), columns=range(i)) bindf = bindf.fillna(0) entropyList = [] total = data.shape[0] for j in range(i): sumTarget = data[data['binned']==j].ix[:,1].sum() prob = sumTarget /total # Applying entropy function entropyList.append(self.calculateEntropy(prob)) totEntropy= 0 # Calculating total Entropy for j in entropyList: totEntropy = totEntropy + (j/len(entropyList)) # Checking if new entropy is lower than the previous one if totEntropy < bestEntropy: print(totEntropy) bestEntropy = totEntropy best = i print(best) else: break except: break global binned binned[list(data.columns.values)[0]] = (pd.qcut(data.ix[:, 0], best, labels=False))
def test_qcut_duplicates_bin(kwargs, msg):
    # see gh-7751
    values = [0, 0, 0, 0, 1, 2, 3]

    if msg is not None:
        with pytest.raises(ValueError, match=msg):
            qcut(values, 3, **kwargs)
    else:
        result = qcut(values, 3, **kwargs)
        expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
        tm.assert_index_equal(result.categories, expected)
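The test above covers qcut's handling of duplicate quantile edges. A standalone sketch of the same behaviour, using the values from the test and the duplicates keyword that pandas provides:

import pandas as pd

values = [0, 0, 0, 0, 1, 2, 3]

# With repeated values the quantile edges collide, so qcut raises by default.
try:
    pd.qcut(values, 3)
except ValueError as err:
    print(err)

# duplicates='drop' collapses the repeated edges and keeps the remaining bins.
result = pd.qcut(values, 3, duplicates='drop')
print(result.categories)  # two bins remain: (-0.001, 1.0] and (1.0, 3.0]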
def CalcBinReturns(self, dfSignalReturns):
    # use qcut to get which day belongs to which bin
    q = pd.qcut(dfSignalReturns[self.alpha_name], self.bin_num)
    tmp = dfSignalReturns.copy()
    tmp['Bin'] = q.values.codes + 1
    topQ = tmp[tmp['Bin'].apply(lambda x: x >= self.alpha_range_lower and x <= self.alpha_range_higher)].copy()
    qFilter = pd.qcut(topQ[self.filter_name], 100)
    topQ['FilterBin'] = qFilter.values.codes + 1
    group = topQ.groupby('FilterBin')
    QuantileReturns = group.mean()
    QuantileReturns['RetBps'] = QuantileReturns[self.return_name] * 10000
    QuantileReturns['Labels'] = Series(index=xrange(1, self.filter_bin_num + 1),
                                       data=qFilter.values.categories)
    return QuantileReturns
def households(store, settings):
    # start with households from urbansim_defaults
    df = datasources.households(store, settings)

    # need to keep track of base year income quartiles for use in the
    # transition model - even caching doesn't work because when you add
    # rows via the transitioning, you automatically clear the cache!
    # this is pretty nasty and unfortunate
    df["base_income_quartile"] = pd.Series(pd.qcut(df.income, 4, labels=False),
                                           index=df.index).add(1)
    df["base_income_octile"] = pd.Series(pd.qcut(df.income, 8, labels=False),
                                         index=df.index).add(1)

    return df
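A small illustration of the quartile-id idiom used above: with labels=False, qcut returns integer codes 0-3, and .add(1) shifts them to 1-4 (the income figures here are made up):

import pandas as pd

income = pd.Series([12000, 35000, 48000, 61000, 90000, 150000, 23000, 75000])

# Integer quartile codes 0..3, shifted to 1..4.
quartile = pd.Series(pd.qcut(income, 4, labels=False), index=income.index).add(1)
print(quartile.tolist())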
def processAge(): global df setMissingAges() # center the mean and scale to unit variance if keep_scaled: scaler = preprocessing.StandardScaler() df['Age_scaled'] = scaler.fit_transform(df['Age']) # have a feature for children df['isChild'] = np.where(df.Age < 13, 1, 0) # bin into quartiles and create binary features df['Age_bin'] = pd.qcut(df['Age'], 4) if keep_binary: df = pd.concat([df, pd.get_dummies(df['Age_bin']).rename(columns=lambda x: 'Age_' + str(x))], axis=1) if keep_bins: df['Age_bin_id'] = pd.factorize(df['Age_bin'])[0]+1 if keep_bins and keep_scaled: scaler = preprocessing.StandardScaler() df['Age_bin_id_scaled'] = scaler.fit_transform(df['Age_bin_id']) if not keep_strings: df.drop('Age_bin', axis=1, inplace=True)
def bokeh_choropleth(df): """stolen more or less directly from http://bokeh.pydata.org/en/0.11.1/docs/gallery/choropleth.html """ states = bksu.data # map looks sooooo bad with these included for stkey in ['HI', 'AK', 'DC']: try: del states[stkey] except KeyError: pass state_xs = [d['lons'] for (code, d) in states.items()] state_ys = [d['lats'] for (code, d) in states.items()] colors = bkpal.Greens9 colors.reverse() state_colors = [] normcron = (df.corn - df.corn.min()) / df.corn.max() stateind = pd.qcut(df.corn, 6).cat.codes state_colors = [ colors[stateind[df.code == statecode].iloc[0]] for (statecode, d) in states.items() ] p = bkp.figure(title='cron', toolbar_location='left', tools=BOKEH_TOOLS) p.patches( state_xs, state_ys, fill_color=state_colors, fill_alpha=0.7, line_color="#884444", line_width=2, line_alpha=0.3 ) return bke.components(p)
def cutData(self, var, bins):
    """Discretize a continuous variable (1): equal-frequency binning."""
    q_var = "q_" + var
    plot_data = self._data.loc[:, [var]].copy()
    if (len(plot_data[var].value_counts()) > 20) & (var not in ['addr_state']):  # more than 20 distinct values: bin the data
        bin_acc = bins
        while (q_var in plot_data.columns.tolist()) == False:
            try:
                plot_data[q_var] = pd.qcut(plot_data[var], bin_acc)
            except:
                # print("can't cut into %s groups" % bin_acc)
                if bin_acc > 1:
                    bin_acc = bin_acc - 1
                    continue
                else:
                    break
        if bin_acc == 1:
            print("can't cut, returning uncut data")
            bins = 1
            plot_data[q_var] = plot_data[var].copy()
        else:
            print("we have cut into %s groups" % bin_acc)
            # print(plot_data[q_var].value_counts())
    else:
        # print("number of categories is lower than 20, we do not re-organize the data")
        plot_data[q_var] = plot_data[var].copy()
    return plot_data
def processFare(): global df # replace missing values as the median fare. Currently the datasets only contain one missing Fare value df['Fare'][ np.isnan(df['Fare']) ] = df['Fare'].median() # zero values cause problems with our division interaction variables so set to 1/10th of the lowest fare df['Fare'][ np.where(df['Fare']==0)[0] ] = df['Fare'][ df['Fare'].nonzero()[0] ].min() / 10 # bin into quintiles for binary features df['Fare_bin'] = pd.qcut(df['Fare'], 4) if keep_binary: df = pd.concat([df, pd.get_dummies(df['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))], axis=1) if keep_bins: df['Fare_bin_id'] = pd.factorize(df['Fare_bin'])[0]+1 # center and scale the fare to use as a continuous variable if keep_scaled: scaler = preprocessing.StandardScaler() df['Fare_scaled'] = scaler.fit_transform(df['Fare']) if keep_bins and keep_scaled: scaler = preprocessing.StandardScaler() df['Fare_bin_id_scaled'] = scaler.fit_transform(df['Fare_bin_id']) if not keep_strings: df.drop('Fare_bin', axis=1, inplace=True)
def plot_sites_by_characteristic(dataframe, lat_col, long_col, title=None, char_column=None, bins=None, dataframe2=None, lat_col2=None, long_col2=None): map = Basemap(projection='merc',llcrnrlat=23.5,urcrnrlat=57, llcrnrlon=-140,urcrnrlon=-50,lat_ts=20,resolution='l') map.drawcoastlines(linewidth = 1.25) plt.title(title) if not char_column: lats = dataframe[lat_col] longs = dataframe[long_col] x,y = map(longs.values,lats.values) map.plot(x, y, ls='', marker='o', markersize=4) if char_column: blues = sns.color_palette("Blues", n_colors=bins) dataframe['quantile'] = pd.qcut(dataframe[char_column], bins) grouped = dataframe.groupby('quantile') i= -1 for groupname, groupdata, in grouped: i = i + 1 colors = blues[i] lats = groupdata["lat"] longs = groupdata["long"] x,y = map(longs.values,lats.values) map.plot(x, y, ls='', marker='o', color=colors, markersize=4) plt.hold(True) if lat_col2: lats = dataframe2[lat_col2] longs = dataframe2[long_col2] x,y = map(longs.values,lats.values) map.plot(x, y, ls='', marker='o', markersize=4, color='brown')
def discretize_data(path, data):
    data_aux = [x[13] for x in data]
    data_discrete = pd.qcut(data_aux, 3, labels=False)
    for i, item in enumerate(data):
        data[i][13] = data_discrete[i]
        # print item
    return data
def get_quantiles_summary(cds_cai_dat,num_of_quantiles,R20_vec_compare,vec_cost): # we can use this 'qcut' function from pandas to divide our proteins by the quantiles ... category,bins = pd.qcut(cds_cai_dat['CAI'],q=num_of_quantiles,retbins=True,labels=False) # then we could iterate over proteins/cDNAs in these categories ... fivywrel_cat, r20_cat, cost_cat = [],[],[] for cat in range(num_of_quantiles): cds_cai_category = cds_cai_dat[category==cat] protein_length_distro = cds_cai_category['protein'].str.len() # average protein length per quantile as a stability measure ... average_length = protein_length_distro.mean() # total proteins length in quantile for AA freqs calculations ... total_length = protein_length_distro.sum() IVYWREL = sum(cds_cai_category['protein'].str.count(aa).sum() for aa in list('IVYWREL')) # IVYWREL = cds_cai_category['protein'].str.count('|'.join("IVYWREL")).sum() # tiny bit slower ... f_IVYWREL = float(IVYWREL)/float(total_length) # 20-vector for of amino acid composition ... aa_freq_20 = np.true_divide([cds_cai_category['protein'].str.count(aa).sum() for aa in aacids],float(total_length)) # slope, intercept, r_value, p_value, std_err = stats.linregress(x,y) _1,_2,R20,_4,_5 = stats.linregress(aa_freq_20, R20_vec_compare) # Akashi ... cost = np.dot(aa_freq_20,vec_cost) # storing info ... fivywrel_cat.append(f_IVYWREL) r20_cat.append(R20) cost_cat.append(cost) #returning ... return (fivywrel_cat,r20_cat,cost_cat)
def run():
    num_average_ticks = 12
    # v=['B', 'H', 'S'] p=[0.05, 0.9, 0.05]
    d = pd.DataFrame(DATA[['timestamp', 'last']])
    d['returns'] = compute_returns(d['last'])
    print(d['returns'].head())
    print(d['returns'].rolling(window=2, center=False).mean().head())
    print(d['returns'])
    sr_column = 'sharpe_ratio_{}'.format(num_average_ticks)
    # reversing with [::-1] makes this a forward-looking apply, not the usual backward-looking one.
    d[sr_column] = pd.rolling_apply(d['returns'][::-1], window=num_average_ticks,
                                    func=sharpe_ratio, center=False).fillna(0)[::-1]
    print(d.tail(100))
    labels = ['SELL', 'HOLD', 'BUY']
    d['signals'] = pd.qcut(d[sr_column], q=[0, 0.05, 0.95, 1], labels=[0, 1, 2])
    print(d.head(100))
    print(d['signals'].head(100))
    d['signals'].astype(np.float).plot()
    import matplotlib.pyplot as plt
    plt.show()
def test_qcut_nat(self, s):
    # GH 19768
    intervals = IntervalIndex.from_tuples(
        [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])])
    expected = Series(Categorical(intervals, ordered=True))

    result = qcut(s, 2)
    tm.assert_series_equal(result, expected)
def historical(): with open(r"capacityFactor.csv", "wb") as csvfile: spamwriter = csv.writer(csvfile, delimiter=' ', quotechar=' ', quoting=csv.QUOTE_MINIMAL) spamwriter.writerow(printG) for element in content: ent = os.listdir("../Data/Production/%s"%element) for en in ent: print en try: data = pd.read_csv("../Data/Production/%s/%s"%(element,en),index_col=0) dato = str(data.columns.values[0]) data[data == 0] = None qs, bins = pd.qcut(data,[.25, .5, .75], retbins=True) print bins[0], bins[1],bins[2] dfList = data[dato].tolist() dato0 = min(dfList, key=lambda x:abs(x-bins[0])) dato1 = min(dfList, key=lambda x:abs(x-bins[1])) dato2 = min(dfList, key=lambda x:abs(x-bins[2])) dato0 = data[data[dato] == dato0].index.tolist() dato1 = data[data[dato] == dato1].index.tolist() dato2 = data[data[dato] == dato2].index.tolist() print dato0, dato1, dato2 #print pd.Series(bins, index=['Production_25', 'Production_50', 'Production_75']) row = str(en[:len(en)-4])+","+str(bins[0])+","+str(bins[1])+","+str(bins[2])+","+str(dato0[0][:4])+","+str(dato1[0][:4])+","+str(dato2[0][:4]) spamwriter.writerow([row]) except (ValueError, IndexError): pass
def show_orders_hist(order_pd, s_list=None, q_default=10):
    if s_list is None:
        s_list = ['lowBkCnt', 'atr_std', 'jump_power', 'diff_days',
                  'wave_score1', 'wave_score2', 'wave_score3',
                  'deg_60WindowPd', 'deg_hisWindowPd', 'deg_windowPd']

    s_list = filter(lambda x: order_pd.columns.tolist().count(x) > 0, s_list)
    for sn in s_list:
        uq = len(np.unique(order_pd[sn]))
        if uq == 1:
            continue
        bins = 10
        bins = uq // 50 if uq // 50 > bins else bins
        order_pd[sn].hist(bins=bins)
        plt.show()

        try:
            cats = pd.qcut(order_pd[sn], q_default)
        except Exception:
            # some value repeats more often than a quantile can hold, so qcut fails;
            # fall back to computing bin edges from the unique values
            import pandas.core.algorithms as algos
            bins = algos.quantile(np.unique(order_pd[sn]), np.linspace(0, 1, q_default + 1))
            cats = pd.tools.tile._bins_to_cuts(order_pd[sn], bins, include_lowest=True)
            # ZLog.info(sn + ' qcut except use bins!')
        ZLog.info('{0} show hist and qcuts'.format(sn))
        ZLog.info(cats.value_counts())
def preprocess_damage_types(data, include_qcut_features):
    """Add damage type features for each quartile. Useful for logistic regression."""
    if include_qcut_features:
        for col in [c for c in data.columns if "Damage" in c]:
            data[col + "_qcut5"] = pandas.qcut(data[col], 5)
    return data
def calculatePowerCurveSensitivity(self, dataFrame, power_curve, dataColumn, power_column): dataFrame['Energy MWh'] = (dataFrame[power_column] * (float(self.timeStepInSeconds) / 3600.)).astype('float') from collections import OrderedDict self.sensitivityLabels = OrderedDict([("V Low","#0000ff"), ("Low","#4400bb"), ("Medium","#880088"), ("High","#bb0044"), ("V High","#ff0000")]) #categories to split data into using data_column and colour to plot cutOffForCategories = list(np.arange(0.,1.,1./len(self.sensitivityLabels.keys()))) + [1.] minCount = len(self.sensitivityLabels.keys()) * 4 #at least 4 data points for each category for a ws bin to be valid wsBinnedCount = dataFrame[['Wind Speed Bin', dataColumn]].groupby('Wind Speed Bin').count() validWsBins = wsBinnedCount.index[wsBinnedCount[dataColumn] > minCount] #ws bins that have enough data for the sensitivity analysis dataFrame['Bin'] = np.nan #pre-allocating for wsBin in dataFrame['Wind Speed Bin'].unique(): #within each wind speed bin, bin again by the categorising by sensCol if wsBin in validWsBins: try: filt = dataFrame['Wind Speed Bin'] == wsBin dataFrame.loc[filt,'Bin'] = pd.qcut(dataFrame[dataColumn][filt], cutOffForCategories, labels = self.sensitivityLabels.keys()) except: print "\tCould not categorise data by %s for WS bin %s." % (dataColumn, wsBin) sensitivityResults = dataFrame[[power_column, 'Energy MWh', 'Wind Speed Bin','Bin']].groupby(['Wind Speed Bin','Bin']).agg({power_column: np.mean, 'Energy MWh': np.sum, 'Wind Speed Bin': len}) sensitivityResults['Energy Delta MWh'], sensitivityResults['Power Delta kW'] = np.nan, np.nan #pre-allocate for i in sensitivityResults.index: sensitivityResults.loc[i, 'Power Delta kW'] = sensitivityResults.loc[i, power_column] - power_curve.powerCurveLevels.loc[i[0], power_column] sensitivityResults.loc[i, 'Energy Delta MWh'] = sensitivityResults.loc[i, 'Power Delta kW'] * power_curve.powerCurveLevels.loc[i[0], 'Data Count'] * (float(self.timeStepInSeconds) / 3600.) return sensitivityResults.rename(columns = {'Wind Speed Bin':'Data Count'}), np.abs(sensitivityResults['Energy Delta MWh']).sum() / (power_curve.powerCurveLevels[power_column] * power_curve.powerCurveLevels['Data Count'] * (float(self.timeStepInSeconds) / 3600.)).sum()
def calculateHLStat(obsOutcome, predOutcomeProb): # Break predicted outcome probabilities into deciles predDeciles = pd.qcut(predOutcomeProb, np.arange(0, 1.1, 0.1)) # Pre-allocate onesArray = np.nan * np.ones((10,3)) zerosArray = np.nan * np.zeros((10,3)) # Loop through deciles for group in range(10): # Observation Counts onesCnt = np.sum(obsOutcome[predDeciles.labels == group]) onesArray[group, 0] = onesCnt zerosCnt = np.sum(predDeciles.labels == group) - onesCnt zerosArray[group, 0] = zerosCnt # Predicted Probabilities onesProb = np.sum(predOutcomeProb[predDeciles.labels == group]) onesArray[group, 1] = onesProb zerosProb = np.sum(predDeciles.labels == group) - onesProb zerosArray[group, 1] = zerosProb # Chi-Squared onesChiSquare = (onesCnt - onesProb) ** 2 / onesProb onesArray[group, 2] = onesChiSquare zerosChiSquare = (zerosCnt - zerosProb) ** 2 / zerosProb zerosArray[group, 2] = zerosChiSquare # Chi-Squared Sum and probability chiSquareSum = np.sum(onesArray[:, 2]) + np.sum(zerosArray[:, 2]) chiSquaredof = 8 # dof = g - 2 chiSquareProb = sm.stats.stattools.stats.chisqprob(chiSquareSum, chiSquaredof) return chiSquareSum, chiSquareProb
def bin_residuals(resid, var, bins):
    '''
    Compute average residuals within bins of a variable.

    Returns a dataframe indexed by the bins, with the bin midpoint,
    the residual average within the bin, and the confidence interval bounds.

    in -- resid, var, bins
    out -- bin DataFrame
    '''
    from pandas import DataFrame, qcut
    import numpy as np

    # bin the variable and aggregate the residuals within each bin
    resid_df = DataFrame({'var': var, 'resid': resid})
    resid_df['bins'] = qcut(var, bins)
    bin_group = resid_df.groupby('bins')
    bin_df = bin_group[['var', 'resid']].mean()
    bin_df['count'] = bin_group['resid'].count()
    bin_df['lower_ci'] = -2 * (bin_group['resid'].std() /
                               np.sqrt(bin_group['resid'].count()))
    bin_df['upper_ci'] = 2 * (bin_group['resid'].std() /
                              np.sqrt(bin_df['count']))
    bin_df = bin_df.sort_values('var')
    return bin_df
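A short usage sketch for bin_residuals; the synthetic residuals and the ten-bin choice are illustrative, not taken from any particular model:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
x = rng.uniform(0, 10, 500)               # predictor variable
resid = rng.normal(0, 1, 500) + 0.1 * x   # synthetic residuals with a mild trend

binned = bin_residuals(pd.Series(resid), pd.Series(x), 10)
print(binned[['var', 'resid', 'lower_ci', 'upper_ci']].head())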
def discretize(df, columnIndex, cutMode, numberOfBins):
    """Performs in-place discretization on a numeric column

    The function has two modes of operation: discretization and quantiling,
    using the :func:`pandas.cut` and :func:`pandas.qcut` functions respectively.

    Args:
        df (pandas.DataFrame): data frame
        columnIndex (int): index of column to discretize
        cutMode (str): 'quantiling' or 'discretization'
        numberOfBins (int): arg passed directly into pandas.cut() and pandas.qcut() functions
    """
    if (cutMode == "discretization"):
        if type(numberOfBins) is not int:
            numberOfBins = numberOfBins.split(',')
            numberOfBins = map(float, numberOfBins)
        df[df.columns[columnIndex]] = pd.cut(df[df.columns[columnIndex]], numberOfBins).astype(str)
    elif (cutMode == "quantiling"):
        if type(numberOfBins) is not int:
            numberOfBins = numberOfBins.split(',')
            numberOfBins = map(float, numberOfBins)
        df[df.columns[columnIndex]] = pd.qcut(df[df.columns[columnIndex]], numberOfBins).astype(str)
    else:
        return False

    # Replace 'nan' strings with np.nan
    df[df.columns[columnIndex]].replace(to_replace="nan", value=np.nan, inplace=True)
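A brief usage sketch for discretize, assuming pd and np are the usual pandas/numpy imports; the DataFrame and its values are invented:

import numpy as np
import pandas as pd

df = pd.DataFrame({'fare': [7.25, 8.05, 13.0, 26.0, 35.5, 71.3, 263.0, np.nan]})
discretize(df, 0, "quantiling", 4)       # equal-frequency bins via pd.qcut
print(df['fare'].value_counts(dropna=False))

df2 = pd.DataFrame({'fare': [7.25, 8.05, 13.0, 26.0, 35.5, 71.3, 263.0]})
discretize(df2, 0, "discretization", 5)  # equal-width bins via pd.cut
print(df2['fare'].unique())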
def response_deciles (result_tbl): #requires: import pandas as pd import numpy as np #calculate gains / lift bins=10 result=pd.DataFrame(result_tbl,columns=['actual','pred']) result['decile']=(bins)-(pd.qcut(result.pred,bins,labels=False)) grp_dec=result.groupby('decile') mean_act_pred=grp_dec['actual','pred'].mean() tbl_gains=grp_dec['actual','pred'].agg(['count','sum', 'mean', 'min', 'max']).sort_values([('pred', 'mean')], ascending=False) l=pd.DataFrame(tbl_gains) l_actual=l['actual'].copy().reset_index() l_actual=l_actual.drop(['min','max'],axis=1) l_actual=l_actual.rename(columns={"mean": "Actual Response Rate","count": "Count","sum": "Responders","decile":"Decile"}) l_pred=l['pred'].copy().reset_index() l_pred=l_pred.drop(['decile','count','sum'],axis=1) l_pred=l_pred.rename(columns={"mean": "Predicted Response Rate", "min":"Min Predicted", "max":"Max Predicted"}) l_comb=pd.concat([l_actual,l_pred],axis=1) return(l_comb["Actual Response Rate"].values)
def test_qcut_include_lowest():
    values = np.arange(10)
    ii = qcut(values, 4)

    ex_levels = IntervalIndex([Interval(-0.001, 2.25), Interval(2.25, 4.5),
                               Interval(4.5, 6.75), Interval(6.75, 9)])
    tm.assert_index_equal(ii.categories, ex_levels)
cmap=matplotlib.colors.ListedColormap( ((0.4, 0.4, 0.4, 0), (0.4, 0.4, 0.4, 1))), vmin=0, vmax=1, alpha=1.0, zorder=20) # Plot the T2M t2m_pc = plot_cube(0.05, -180, 180, -90, 90) t2m = t2m.regrid(t2m_pc, iris.analysis.Linear()) t2m = quantile_normalise_t2m(t2m) # Adjust to show the wind wscale = 200 s = wind_noise_field.data.shape wind_noise_field.data = qcut( wind_noise_field.data.flatten(), wscale, labels=False, duplicates='drop').reshape(s) - (wscale - 1) / 2 # Plot as a colour map wnf = wind_noise_field.regrid(t2m, iris.analysis.Linear()) t2m_img = ax.pcolorfast(lons, lats, t2m.data * 1000 + wnf.data, cmap='RdYlBu_r', alpha=0.8, vmin=-100, vmax=1100, zorder=100) # PRMSL contours prmsl_pc = plot_cube(0.25, -180, 180, -90, 90)
train['GarageFinish'].isnull().sum() train['GarageYrBlt'].isnull().sum() train['GarageQual'].isnull().sum() train['GarageType'] = train['GarageType'].fillna('NG') train['GarageCond'] = train['GarageCond'].fillna('NG') train['GarageFinish'] = train['GarageFinish'].fillna('NG') train['GarageYrBlt'] = train['GarageYrBlt'].fillna('NG') train['GarageQual'] = train['GarageQual'].fillna('NG') train['BsmtExposure'] = train['BsmtExposure'].fillna('NB') train['BsmtFinType2'] = train['BsmtFinType2'].fillna('NB') train['BsmtFinType1'] = train['BsmtFinType1'].fillna('NB') train['BsmtCond'] = train['BsmtCond'].fillna('NB') train['BsmtQual'] = train['BsmtQual'].fillna('NB') train['MasVnrType'] = train['MasVnrType'].fillna('none') train.Electrical = train.Electrical.fillna('SBrkr') train["LotAreaCut"] = pd.qcut(train.LotArea, 10) train['LotFrontage'] = train.groupby( ['LotAreaCut', 'Neighborhood'])['LotFrontage'].transform(lambda x: x.fillna(x.median())) train['LotFrontage'] = train.groupby( ['LotAreaCut'])['LotFrontage'].transform(lambda x: x.fillna(x.median())) train.drop("LotAreaCut", axis=1, inplace=True) #all_columns = train.columns.values #non_categorical = ["LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", # "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", # "2ndFlrSF", "LowQualFinSF", "GrLivArea", "GarageArea", # "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", # "ScreenPorch","PoolArea", "MiscVal"] #categorical = [value for value in all_columns if value not in non_categorical]
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
traindf.head()

# In[ ]:

testdf['Fare'].fillna(testdf['Fare'].dropna().median(), inplace=True)
testdf.head()

# In[ ]:

traindf['FareBand'] = pd.qcut(traindf['Fare'], 4)
traindf[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

# In[ ]:

for dataset in combine:
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
def perform_operations(df, col_name, operations): """ Execute operations on a certain column in the dataframe. Dtypes Operations: Description: ALL drop drop the entire column Numerics log perform log transformation on the column box cox perform box cox transformation on the column drop0 drop all values with zeros in it absneg absolute value the negatives median0 replace 0 with the median binning_NUM create NUM amount of bins outlierZ_NUM remove outliers with z score > NUM shiftmin subtract the columns by the minimum value Datetime finddays convert datetime to days since the first day Categorical/Object cbinning_NUM create bins where each bin must have occurences of NUM or higher mostcommon replace nan with most common category :param df: dataframe :type df: pandas.core.frame.DataFrame :param col_name: name of column :type col_name: str :param operations: list of operations to perform on the certain column :type operations: list :returns: transformed dataframe :rtype: pandas.core.frame.DataFrame """ col = df[col_name] # iterate throughout the list of transformations for each column for operation in operations: if operation == 'drop': # immediately returns the dataframe since no more operations can be # performed on a drop column return df.drop(col_name, axis=1) # numeric columns elif str(col.dtype) in {'int8', 'int16', 'int32', 'float64'}: if operation == 'log': col = np.log(1 + col) # to make sure no divide by zero elif operation == 'box cox': col = ss.boxcox(col + 0.001) # to make sure no divide by zero elif operation == "drop0": df = df[col != 0] col = col[col != 0] elif operation == "absneg": col = col.abs() elif operation == "median0": from sklearn.preprocessing import Imputer col[col == 0] = np.nan imputer = Imputer(strategy="median") col = imputer.fit_transform(col.values.reshape(-1, 1)) elif operation.split('_')[0] == 'binning': # name would be binning_NUM num = int(operation.split('_')[1]) quantile_list = [i / (num - 1) for i in range(num)] # this column with DROP_ will eventually be dropped. # It is here if one were to reference the the bins df["DROP_" + col_name] = pd.qcut( col, q=quantile_list, duplicates='raise', ) col = pd.qcut( col, q=quantile_list, duplicates='raise', labels=quantile_list[1:] ) elif operation.split('_')[0] == 'outlierZ': z = np.abs(ss.zscore(col)) keep_values = z < float(operation.split('_')[1]) df = df[keep_values] col = col[keep_values] elif operation == "shiftmin": col = col - col.min() else: raise ValueError('Not an available operation for numerics') # datetime columns elif str(col.dtype) in {'datetime64[ns]'}: # TODO: add more datetime dtypes (not sure if that is the only one) if operation == "finddays": # TODO: should NOT be min, will not generalize to validation/test col = (col - min(col)).dt.days # categorical or object columns elif str(col.dtype) in {'category', 'object'}: if operation.split('_')[0] == "cbinning": num = float(operation.split('_')[1]) value_counts = col.value_counts() x = col.replace(value_counts) df[col_name][df[col_name] == '0'] = np.nan df[col_name] = df[col_name].cat.add_categories(['OTHER']) df[col_name] = df[col_name].fillna('OTHER') df.loc[x < num, col_name] = 'OTHER' return df elif operation == "mostcommon": most_common = col.value_counts().index[0] col = col.fillna(most_common) else: raise ValueError( 'Not an available operation for categoricals or objects') else: raise ValueError('Not an available data type') df[col_name] = col return df
ax1.set_title("Box plot for all the values", fontsize=20) plt.setp(ax1.get_xticklabels(), ha="right", rotation=35) plt.setp(ax1.get_yticklabels(), ha="right", fontsize=15) ax1.boxplot(no_null_col) ax1 = fig3.add_subplot(2,3,2) ax1.set_title("Distribution of all values", fontsize=20) plt.setp(ax1.get_xticklabels(), ha="right", rotation=35, fontsize=15) plt.setp(ax1.get_yticklabels(), ha="right", fontsize=15) ax1.hist(no_null_col) ax1 = fig3.add_subplot(2,3,3) ax1.set_title("Boxplot for quartiles (all values)", fontsize=20) if len(no_null_col.value_counts()) >= 4: data[u'quartiles'] = pd.qcut( data[col_name], 4, duplicates='drop') data.boxplot(column= col_name, by=u'quartiles', ax = ax1) plt.setp(ax1.get_xticklabels(), ha="right", rotation=35, fontsize=15) plt.setp(ax1.get_yticklabels(), ha="right", fontsize=15) ax1 = fig3.add_subplot(2,3,4) ax1.set_title("Box plot without outliers", fontsize=20) plt.setp(ax1.get_xticklabels(), ha="right", rotation=35, fontsize=15) plt.setp(ax1.get_yticklabels(), ha="right", fontsize=15) ax1.boxplot(no_null_col, showfliers=False) ax1 = fig3.add_subplot(2,3,5) ax1.set_title("Violin plot (<95% percentile)", fontsize=20) plt.setp(ax1.get_xticklabels(), ha="right", rotation=35, fontsize=15) plt.setp(ax1.get_yticklabels(), ha="right", fontsize=15)
def automate_Raking(Data): """ This functions which Data file with [Company Name, Para1,para2 ....., ParaN, Shareprice_Appriciation] Get the combinations columsn list... Created DataFrames with this combinations.. ## Checking with Multicolinearity with parameters.. Threshold 0.75 ## Ranking on Parameters ## Column name== Parametername + _Rank ## Avg_value of Ranking parameters ##Column name = Avg_Weightage_Rank ## ranking on Avg_Weightage_Rank ###Column = Weightages_Avarage_Rank ## sort_quartiles by Return DataFrame with Quartiles.. Returns : my_dfs ==> After multicolinearity all the combinations DataFrames... Sorted_dfs ==> Group by Quartiles DataFrames... reductions_Dfs ==> Reductions Dfs... """ Df = Data Df = Df.fillna(0) # Df = Df[Df.iloc[:,-1].replace({0:-1})] Df = Df.copy() df_list = list(Df.columns) fina_ls = [] for i in range(1, len(df_list[1:])): s = rSubset(df_list[2:-1], i) combi_list = [] for j in s: combi_list.append(list(j)) fina_ls.append(combi_list) print("Length of Cobmbinations", len(fina_ls)) ## Created dataframes with all combinations... multi_corr = [] # fina_ls[0] for j in fina_ls[:]: for i in j[:]: i.insert(0, 'Company Name') i.insert(1, 'Portfolio') i.extend(['Shareprice_Appriciation']) df1 = pd.DataFrame(Df[i]) multi_corr.append(df1) ############# ### Getting Non Multi collinearity commbinations All_Dataframes = [] reductions_Dfs = [] for i in range(len(multi_corr[:])): n = pd.DataFrame(multi_corr[i].iloc[:, :-1].corr()[:] >= 0.75) leng = len(n) s = n.values j = np.eye(leng) == 1 comparison = s == j equal_arrays = comparison.all() if equal_arrays == True: All_Dataframes.append(multi_corr[i]) else: reductions_Dfs.append(multi_corr[i]) print('After Multi_Collinearity', len(All_Dataframes)) print('Reductions ', len(reductions_Dfs)) # ## Giving the ranks to features... depends on correlations with Return%... 
for frame in All_Dataframes: copied_frame = frame.copy() correlation = frame.corr() copied_Cor = correlation.copy() for j in range(0, len(copied_Cor.columns) - 1): ## Its -2 columns = list(copied_Cor.columns) #print('Value',columns[j]) columns_name = columns[j] k = len(copied_Cor.columns) - 1 #print('K Value',k) i = j + 2 ## J+ 4 means after from 5th index # if copied_Cor.iloc[j,k] >= 0.05: ## Dont use frame[str(columns_name) + '_Rank'] = copied_frame.iloc[:, i].rank( method='first', ascending=0) my_dfs = All_Dataframes.copy() for f in my_dfs: L = f.columns.get_loc('Shareprice_Appriciation') + 1 col = f.iloc[:, L:] f['Avg_Rank'] = col.mean(axis=1).round() Avg_ranks = my_dfs.copy() my_dfs = [] for new in Avg_ranks: i = new.columns.get_loc("Shareprice_Appriciation") + 1 j = new.columns.get_loc("Avg_Rank") l = len(new.columns[i:j]) z = np.ones(l).tolist() for k in range(0, l): arrs = z.copy() for m in range(1, 6): arrs[k] = m df1 = new.copy() p = df1.columns.get_loc("Shareprice_Appriciation") + 1 q = df1.columns.get_loc("Avg_Rank") cols = list(df1.columns[p:q]) weightage_list = [ '_Weight_', '_Weight_', '_Weight_', '_Weight_', '_Weight_', '_Weight_', '_Weight_', '_Weight_', '_Weight_', '_Weight_', '_Weight_' ] Separater_list = [ '|', '|', '|', '|', '|', '|', '|', '|', '|', '|', '|', '|', '|', '|', '|' ] w_c = get_columnnames(weightage_list, cols, arrs, Separater_list) # print(p,q) # print(cols) # print(arrs) df1[w_c] = df1.iloc[:, p:q] * arrs # print(df1) df = pd.DataFrame(df1) # print(df) my_dfs.append(df) #print('***') #print('#####') for frames in my_dfs: # frame = frames.copy() i = frames.columns.get_loc("Avg_Rank") + 1 frames['Weightages_Avarage_Rank'] = frames.iloc[:, i:].mean( axis=1).round() ## j = frames.columns.get_loc("Weightages_Avarage_Rank") frames['Weighatages_Rank'] = frames.iloc[:, j].rank(method='first', ascending=1) ## lables_ = [] for i in range(1, int(np.sqrt(Df.shape[0]).round()) + 1): lab = 'Q' + str(i) lables_.append(lab) Sorted_dfs = [] for frames in my_dfs: #frames["Quartiles"] = pd.qcut(frames['Weightages_Avarage_Rank'].rank(method='first'), int(np.sqrt(frames.shape[0]).round()) , labels=["Q1", "Q2", "Q3","Q4","Q5","Q6","Q7"]) frames["Quartiles"] = pd.qcut( frames['Weighatages_Rank'].rank(method='first'), int(np.sqrt(frames.shape[0]).round()), labels=lables_) Testing_Q = frames.copy() Sort_df = sort_quartiles(Testing_Q) sortted_q = list(Sort_df.iloc[:, 0]) # if sortted_q[0] > sortted_q[1] > sortted_q[2] > sortted_q[3]: # print('Yes Falling down...') # Falling_down.append(list(Sort_df.columns)) # else: Sorted_dfs.append(Sort_df) return my_dfs, Sorted_dfs, reductions_Dfs
'perfect_pred', ascending=False)[diagnostic_cols_to_show].head()) print("MOVES LEAST LIKELY TO MAKE THE BEST MOVE:") print( moves_to_test.sort('perfect_pred', ascending=True)[diagnostic_cols_to_show].head()) else: imperfect_moves = moves_to_test[moves_to_test['clipped_movergain'] < 0] X = imperfect_moves[features] y = imperfect_moves['clipped_movergain'] pred_y = model.predict(X) mask = y < pred_y score = float(mask.sum()) / y.shape[0] print(( 'imperfect-move error-size quantile model for %s: true quantile is %f' % (key, score))) combo = concat([Series(y.values), Series(pred_y)], axis=1) combo_groups = qcut(combo[1], 10) combo_stats = combo.groupby(combo_groups)[0].agg({ 'mean': np.mean, 'q': lambda x: np.percentile(x, float(mg_quant) * 100), 'count': len }) print(("%s distribution of error by prediction range:\n%s" % (elo_name, combo_stats)))
def add_stats(df): df['gp'] = df.apply(active_games, axis=1) df['min_3g_avg'] = df.apply(min_3g_avg, axis=1) #df['min_7d_avg'] = df.apply(min_avg_7_days, axis=1) df['min_90d_avg'] = df.apply(min_avg_90_days, axis=1) df['dk_avg_90_days'] = df.apply(dk_avg_90_days, axis=1) # df['teampts_avg'] = df.apply(team_pts_90_days, axis=1) # df['opppts_avg'] = df.apply(opp_pts_90_days, axis=1) df['dk_per_min'] = df['dk_avg_90_days'] / df['min_90d_avg'] # transform DK points to more normal distro df['DKP_trans'] = df['DKP']**.5 # create columns for - positive DK change; negative DK change # df['dk_sal_increase'] = np.where((df['dk_change'] > 0), True, False) # df['dk_sal_decrease'] = np.where((df['dk_change'] < 0), True, False) # create standard dev and max columns df['dk_std_90_days'] = df.apply(dk_std_90_days, axis=1) df['dk_max_30_days'] = df.apply(dk_max_30_days, axis=1) # get min when starting / bench df['min_when_start'] = df.apply(min_when_starting, axis=1) df['min_when_bench'] = df.apply(min_when_bench, axis=1) # count games started in past week df['starts_past_week'] = df.apply(starts_past_week, axis=1) # adjust minutes df['min_proj'] = df.apply(adjust_minutes, axis=1) # add dvp df['dvp'] = df.apply(dvp, axis=1) # add dvp rank df['dvprank'] = pd.qcut( df['dvp'], [ 0.05, 0.1, 0.25, 0.5, 0.75, .93, 1], labels=False) # combine PACE and dvp df['pace_dvp'] = (df['pace_sum'] / 10) + df['dvp'] # create summary stats df['pts'] = df['Stats'].str.extract('(\d*)pt') df['rbs'] = df['Stats'].str.extract('(\d*)rb') df['stl'] = df['Stats'].str.extract('(\d*)st') df['ast'] = df['Stats'].str.extract('(\d*)as') df['blk'] = df['Stats'].str.extract('(\d*)bl') df['3pm'] = df['Stats'].str.extract('(\d*)trey') df['fgm'] = df['Stats'].str.extract('(\d*)-\d*fg') df['fga'] = df['Stats'].str.extract('\d*-(\d*)fg') df['ftm'] = df['Stats'].str.extract('(\d*)-\d*ft') df['fta'] = df['Stats'].str.extract('\d*-(\d*)ft') df['tov'] = df['Stats'].str.extract('(\d*)to') df[['pts', 'rbs', 'stl', 'ast', 'blk', '3pm', 'fgm', 'fga', 'ftm', 'fta', 'tov']] = df[['pts', 'rbs', 'stl', 'ast', 'blk', '3pm', 'fgm', 'fga', 'ftm', 'fta', 'tov']].apply(lambda x: pd.to_numeric(x, errors='coerce')) df[['pts', 'rbs', 'stl', 'ast', 'blk', '3pm', 'fgm', 'fga', 'ftm', 'fta', 'tov']].fillna(0, inplace=True) # add yesterdays minutes df['min_yest'] = df.apply(min_yest, axis=1) # create back to back boolean column [over 30 minutes played the prior day] df['b2b'] = df.apply(create_b2b_bool, axis=1) # fillna just in case df['Minutes'] = df['Minutes'].fillna(value=0) df['fga'] = df['fga'].fillna(value=0) df['fta'] = df['fta'].fillna(value=0) df['tov'] = df['tov'].fillna(value=0) # add team stats for usage calc df['team_mp'] = df.apply(team_mp, axis=1) df['team_fga'] = df.apply(team_fga, axis=1) df['team_fta'] = df.apply(team_fta, axis=1) df['team_tov'] = df.apply(team_tov, axis=1) # add individual usage / 3 game rolling avg df['usage'] = df.apply(usage, axis=1) df['usage_3g_avg'] = df.apply(usage_3g_avg, axis=1) df['usage_5g_avg'] = df.apply(usage_5g_avg, axis=1) # add value / 3 game rolling avg for val df['value'] = df.apply(value, axis=1) df['value_3g_avg'] = df.apply(value_3g_avg, axis=1) # add starter min - average minutes played of all the starters df['starter_min'] = df.apply(starter_min, axis=1) # add game by game minutes vs starter average df['min_vs_starters'] = df['Minutes'] - df['starter_min'] df['mvs_5g_avg'] = df.apply(mvs_5g_avg, axis=1) # add 3game average of starter minutes df['starter_5g_avg'] = df.apply(starter_5g_avg, axis=1) # add 
rolling avg of fga df['fga_3g_avg'] = df.apply(fga_3g_avg, axis=1) # add double double count df['dbl_dbl_cnt'] = df.apply(dbl_dbl, axis=1) # create "double double per game" stat df['dbl_dbl_per_game'] = df['dbl_dbl_cnt'] / df['gp'] # combo stat: Minutes + FGA + dbl_dbl_per_game df['combo'] = df['min_proj'] + df['dbl_dbl_per_game'] + df['fga_3g_avg'] return(df)
embarkedmapping = {"S": 1, "C": 2, "Q": 3} trainingset['Embarked'] = trainingset['Embarked'].map(embarkedmapping) testingset['Embarked'] = testingset['Embarked'].map(embarkedmapping) testingset.head() # In[ ]: #FILLING MISSING FARE VALUES AND MAPPING THEM INTO NUMERIC VALUES #MISSING VALUE IS BASED ON THE CLASS OF THE PASSENGER for x in range(len(testingset["Fare"])): if pd.isnull(testingset["Fare"][x]): pclass = testingset["Pclass"][x] testingset["Fare"][x] = round(trainingset[trainingset["Pclass"] == pclass]["Fare"].mean(), 4) trainingset['FareBin'] = pd.qcut(trainingset['Fare'], 4, labels = [1, 2, 3, 4]) testingset['FareBin'] = pd.qcut(testingset['Fare'], 4, labels = [1, 2, 3, 4]) trainingset = trainingset.drop(['Fare'], axis = 1) testingset = testingset.drop(['Fare'], axis = 1) testingset.head() # # (6). Algorithm Modelling # We will now use the training set to test the accuracy of the SVM, RF, KNN and DT algorithms. # In[ ]: from sklearn.model_selection import train_test_split p = trainingset.drop(['Survived', 'PassengerId'], axis=1) targetset = trainingset["Survived"]
train = train.drop("Name", axis=1)
test = test.drop("Name", axis=1)

# In[ ]:

print(test.head())
print(train.head())

# In[ ]:

train['Survived'].groupby(pd.qcut(train['Ticket_len'], 4)).mean()
# train['Ticket_len'].groupby(train['Survived']).mean()

# In[ ]:

X_train = train.drop("Survived", axis=1)
Y_train = train["Survived"]
X_test = test.drop("PassengerId", axis=1).copy()

# In[ ]:

X_train = X_train.drop("PassengerId", axis=1)
""" Created on Sun Mar 15 21:02:40 2020 @author: reocar """ # 等距分厢 # 等频分箱 import pandas as pd from sklearn.cluster import KMeans from sklearn import datasets df = pd.DataFrame([[22, 1], [13, 1], [33, 1], [52, 0], [16, 0], [42, 1], [53, 1], [39, 1], [26, 0], [66, 0]], columns=['age', 'Y']) df['age_bin_2'] = pd.cut(df['age'], 3) #等距分箱 df['age_bin_1'] = pd.qcut(df['age'], 3) #等频分箱 display(df) # k-mean分箱(待修改) iris = datasets.load_iris() X = iris.data y = iris.target X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=666) kmodel = KMeans(n_clusters=2) #k为聚成几类 kmodel.fit(X_train[:, 0].reshape(len(X_train[:, 0]), 1)) #训练模型 c = pd.DataFrame(kmodel.cluster_centers_) #求聚类中心 c = c.sort_values(by=0) #排序 w = c.rolling(2).mean().iloc[1:] #用滑动窗口求均值的方法求相邻两项求中点,作为边界点
str) + '_' + data_all['dist1'].astype(str)

# Amt
def Amt_decimal_len(amount):
    split = str(amount).split('.')
    if len(split) > 1:
        return len(split[-1])
    return 0

data_all['Amt_decimal_len'] = data_all['TransactionAmt'].map(Amt_decimal_len)
data_all['Amt_decimal'] = ((data_all['TransactionAmt'] - data_all['TransactionAmt'].astype(int)) * 1000).astype(int)
data_all['Amt_interval'] = pd.qcut(data_all['TransactionAmt'], 20)

cols = ['ProductCD', 'card1', 'card2', 'card5', 'card6', 'addr1', 'P_email', 'R_email']
for f in cols:
    data_all[f'Amt_mean_{f}'] = data_all.groupby([f])['TransactionAmt'].transform('mean')
    data_all[f'Amt_std_{f}'] = data_all.groupby([f])['TransactionAmt'].transform('std')
    data_all[f'Amt_pct_{f}'] = (data_all['TransactionAmt'] - data_all[f'Amt_mean_{f}']) / data_all[f'Amt_std_{f}']
print('Amt cols are done.')
"Teenager": 3, "Student": 4, "Young Adult": 5, "Adult": 6, "Senior": 7 } train['AgeGroup'] = train['AgeGroup'].map(age_mapping) test['AgeGroup'] = test['AgeGroup'].map(age_mapping) train = train.drop(['Age'], axis=1) test = test.drop(['Age'], axis=1) train.head() # Fare: 티켓의 요금 # qcut 함수를 사용. 4개의 범위로 cut train['FareBand'] = pd.qcut(train['Fare'], 4, labels={1, 2, 3, 4}) test['FareBand'] = pd.qcut(test['Fare'], 4, labels={1, 2, 3, 4}) train = train.drop(['Fare'], axis=1) test = test.drop(['Fare'], axis=1) train.head() # ********************* # 데이터 모델링 # ********************** train_data = train.drop('Survived', axis=1) target = train['Survived'] train_data.shape, target.shape # ((891, 8), (891,))
"C": 1, "Q": 2 }).astype(int) #Family Stuff for dataset in combine: dataset["FamilySize"] = dataset["SibSp"] + dataset[ "Parch"] + 1 #getting family size for dataset in combine: dataset["IsAlone"] = 0 dataset.loc[dataset["FamilySize"] == 1, "IsAlone"] = 1 #Creating a interval for fare for dataset in combine: dataset["Fare"] = dataset["Fare"].fillna(train["Fare"].median()) train["CategoricalFare"] = pd.qcut(train["Fare"], 4) #mapping fare for dataset in combine: dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0 dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1 dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2 dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3 dataset['Fare'] = dataset['Fare'].astype(int) #looking at the titles in the names for dataset in combine: dataset["Title"] = dataset.Name.str.extract(" ([A-Za-z]+)\.", expand=False) #print(pd.crosstab(train["Title"], train["Sex"]))
def plot_leaflet_network( wn, node_attribute=None, link_attribute=None, node_attribute_name='Value', link_attribute_name='Value', node_size=2, node_range=[None, None], node_cmap=['cornflowerblue', 'forestgreen', 'gold', 'firebrick'], node_cmap_bins='cut', node_labels=True, link_width=2, link_range=[None, None], link_cmap=['cornflowerblue', 'forestgreen', 'gold', 'firebrick'], link_cmap_bins='cut', link_labels=True, add_legend=False, round_ndigits=2, zoom_start=13, add_to_node_popup=None, add_to_link_popup=None, filename='leaflet_network.html'): """ Create an interactive scalable network graphic on a Leaflet map using folium. Parameters ---------- wn : wntr WaterNetworkModel A WaterNetworkModel object node_attribute : None, str, list, pd.Series, or dict, optional - If node_attribute is a string, then a node attribute dictionary is created using node_attribute = wn.query_node_attribute(str) - If node_attribute is a list, then each node in the list is given a value of 1. - If node_attribute is a pd.Series, then it should be in the format {nodeid: x} where nodeid is a string and x is a float. - If node_attribute is a dict, then it should be in the format {nodeid: x} where nodeid is a string and x is a float link_attribute : None, str, list, pd.Series, or dict, optional - If link_attribute is a string, then a link attribute dictionary is created using edge_attribute = wn.query_link_attribute(str) - If link_attribute is a list, then each link in the list is given a value of 1. - If link_attribute is a pd.Series, then it should be in the format {linkid: x} where linkid is a string and x is a float. - If link_attribute is a dict, then it should be in the format {linkid: x} where linkid is a string and x is a float. node_attribute_name : str, optional The node attribute name, which is used in the node popup and node legend link_attribute_name : str, optional The link attribute name, which is used in the link popup and link legend node_size : int, optional Node size node_range : list, optional Node range ([None,None] indicates autoscale) node_cmap : list of color names, optional Node colors node_cmap_bins: string, optional Node color bins, 'cut' or 'qcut' node_labels: bool, optional If True, the graph will include each node labelled with its name. link_width : int, optional Link width link_range : list, optional Link range ([None,None] indicates autoscale) link_cmap : list of color names, optional Link colors link_cmap_bins: string, optional Link color bins, 'cut' or 'qcut' link_labels: bool, optional If True, the graph will include each link labelled with its name. add_legend: bool, optional Add a legend to the map round_ndigits : int, optional Rounds digits in the popup zoom_start : int, optional Zoom start used to set initial scale of the map add_to_node_popup : None or pd.DataFrame, optional To add additional information to the node popup, use a DataFrame with node name as index and attributes as values. Column names will be added to the popup along with each value for a given node. add_to_link_popup : None or pd.DataFrame, optional To add additional information to the link popup, use a DataFrame with link name as index and attributes as values. Column names will be added to the popup along with each value for a given link. 
filename : str, optional Filename used to save the map """ if folium is None: raise ImportError('folium is required') if node_attribute is not None: if isinstance(node_attribute, list): node_cmap = ['red'] node_attribute = _format_node_attribute(node_attribute, wn) node_attribute = pd.Series(node_attribute) if node_range[0] is not None: node_attribute[node_attribute < node_range[0]] = node_range[0] if node_range[1] is not None: node_attribute[node_attribute > node_range[1]] = node_range[1] if node_cmap_bins == 'cut': node_colors, node_bins = pd.cut(node_attribute, len(node_cmap), labels=node_cmap, retbins=True) elif node_cmap_bins == 'qcut': node_colors, node_bins = pd.qcut(node_attribute, len(node_cmap), labels=node_cmap, retbins=True) if link_attribute is not None: if isinstance(link_attribute, list): link_cmap = ['red'] link_attribute = _format_link_attribute(link_attribute, wn) link_attribute = pd.Series(link_attribute) if link_range[0] is not None: link_attribute[link_attribute < link_range[0]] = link_range[0] if link_range[1] is not None: link_attribute[link_attribute > link_range[1]] = link_range[1] if link_cmap_bins == 'cut': link_colors, link_bins = pd.cut(link_attribute, len(link_cmap), labels=link_cmap, retbins=True) elif link_cmap_bins == 'qcut': link_colors, link_bins = pd.qcut(link_attribute, len(link_cmap), labels=link_cmap, retbins=True) G = wn.get_graph() pos = nx.get_node_attributes(G, 'pos') center = pd.DataFrame(pos).mean(axis=1) m = folium.Map(location=[center.iloc[1], center.iloc[0]], zoom_start=zoom_start, tiles='cartodbpositron') #folium.TileLayer('cartodbpositron').add_to(m) # Node popup node_popup = {k: '' for k in wn.node_name_list} if node_labels: for name, node in wn.nodes(): node_popup[name] = node.node_type + ': ' + name if node_attribute is not None: if name in node_attribute.index: node_popup[name] = node_popup[name] + '<br>' + \ node_attribute_name + ': ' + '{:.{prec}f}'.format(node_attribute[name], prec=round_ndigits) if add_to_node_popup is not None: if name in add_to_node_popup.index: for key, val in add_to_node_popup.loc[name].iteritems(): node_popup[name] = node_popup[name] + '<br>' + \ key + ': ' + '{:.{prec}f}'.format(val, prec=round_ndigits) # Link popup link_popup = {k: '' for k in wn.link_name_list} if link_labels: for name, link in wn.links(): link_popup[name] = link.link_type + ': ' + name if link_attribute is not None: if name in link_attribute.index: link_popup[name] = link_popup[name] + '<br>' + \ link_attribute_name + ': ' + '{:.{prec}f}'.format(link_attribute[name], prec=round_ndigits) if add_to_link_popup is not None: if name in add_to_link_popup.index: for key, val in add_to_link_popup.loc[name].iteritems(): link_popup[name] = link_popup[name] + '<br>' + \ key + ': ' + '{:.{prec}f}'.format(val, prec=round_ndigits) if node_size > 0: for name, node in wn.nodes(): loc = (node.coordinates[1], node.coordinates[0]) radius = node_size color = 'black' if node_labels: popup = node_popup[name] else: popup = None if node_attribute is not None: if name in node_attribute.index: color = node_colors[name] else: radius = 0.1 folium.CircleMarker(loc, popup=popup, color=color, fill=True, fill_color=color, radius=radius, fill_opacity=0.7, opacity=0.7).add_to(m) if link_width > 0: for name, link in wn.links(): start_loc = (link.start_node.coordinates[1], link.start_node.coordinates[0]) end_loc = (link.end_node.coordinates[1], link.end_node.coordinates[0]) weight = link_width color = 'black' if link_labels: popup = link_popup[name] else: popup = None if 
link_attribute is not None: if name in link_attribute.index: color = link_colors[name] else: weight = 1.5 folium.PolyLine([start_loc, end_loc], popup=popup, color=color, weight=weight, opacity=0.7).add_to(m) if (add_legend) & ((len(node_cmap) >= 1) or (len(link_cmap) >= 1)): if node_attribute is not None: #Produce node legend height = 50 + len(node_cmap) * 20 + ( int(len(node_attribute_name) / 20) + 1) * 20 node_legend_html = """<div style="position: fixed; bottom: 50px; left: 50px; width: 150px; height: """ + str( height) + """px; background-color:white;z-index:9999; font-size:14px; "><br> <b><P ALIGN=CENTER>""" + "Node Legend: " + node_attribute_name + """</b> </P>""" for color, val in zip(node_cmap, node_bins[0:-1]): val = '{:.{prec}f}'.format(val, prec=round_ndigits) node_legend_html += """  <i class="fa fa-circle fa-1x" style="color:""" + color + """ "></i> >= """ + val + """ <br>""" node_legend_html += """</div>""" m.get_root().html.add_child(folium.Element(node_legend_html)) if link_attribute is not None: #Produce link legend height = 50 + len(link_cmap) * 20 + ( int(len(link_attribute_name) / 20) + 1) * 20 link_legend_html = """<div style="position: fixed; bottom: 50px; left: 250px; width: 150px; height: """ + str(height) + """px; background-color:white;z-index:9999; font-size:14px; "><br> <b><P ALIGN=CENTER>""" + "Link Legend: " + link_attribute_name + """</b> </P>""" for color, val in zip(link_cmap, link_bins[0:-1]): val = '{:.{prec}f}'.format(val, prec=round_ndigits) link_legend_html += """  <i class="fa fa-minus fa-1x" style="color:""" + color + """ "></i> >= """ + val + """ <br>""" link_legend_html += """</div>""" m.get_root().html.add_child(folium.Element(link_legend_html)) #plugins.Search(points, search_zoom=20, ).add_to(m) #if add_longlat_popup: # m.add_child(folium.LatLngPopup()) folium.LayerControl().add_to(m) m.save(filename)
def get_Data(**kw): kwargs = {"varName":None, "args":None, "qcut":0, "qType":"norm", "orth":False } #kwargs["orth"] = True kwargs.update(kw) data = briefstats.data data.loc[:,"vwap"] = briefstats.get_vwap(1).values.reshape(-1) X = pd.DataFrame() _varName = kwargs["varName"] if not isinstance(_varName,list): _varName = [_varName,] filename = get_hash(_varName, kwargs["args"]) try: __col = np.load('data/col{}.npy'.format(filename)) __index = np.load("data/index{}.npy".format(filename)) __values = np.load("data/values{}.npy".format(filename)) X = pd.DataFrame(__values,columns=__col,index=__index) return X except: pass mabp = (data["askPrc"] + data["bidPrc"]) / 2 mabpD = mabp.diff(1) mabpD.iloc[0] = 0 _ratio = ((data["askQty"] * data["askPrc"] - data["bidQty"] * data["bidPrc"]) / (data["askQty"] * data["askPrc"] + data["bidQty"] * data["bidPrc"])).values.reshape(-1) qwap = (data["askPrc"]*data["askQty"] + data["bidPrc"]*data["bidQty"]) / (data["askQty"]+data["bidQty"]) qwapD = qwap.diff(1) qwapD.iloc[0] = 0 def args(v,default=1,lb=None): return get_args(v,kwargs['args'],default,lb=lb) for varName in _varName: try: print -1,varName X.loc[:,varName] = data[varName].values.reshape(-1) #logging.debug(tmp) continue # return tmp except: try: _vwap = re.match("vwap",varName).span() if _vwap is not None: tmp = varName[_vwap[1]:] if tmp == "D": X.loc[:,"vwapD"] = data["vwap"].diff(1).values.reshape(-1) X.iloc[0, -1] = 0 continue # return X if tmp == "Log": X.loc[:,"vwapLog"] = np.log(data["vwap"]).diff(1).values.reshape(-1) X.iloc[0, -1] = 0 continue # return X if tmp == "": X.loc[:,"vwap"] = briefstats.get_vwap(1).values.reshape(-1) continue if tmp == "DEWM": vwap = data["vwap"].diff(1) vwap[0] = 0 vwapEwm = vwap.ewm(com=1).mean() X.loc[:,"vwapDEWM"] = vwapEwm.values # return X except: logging.debug("X data do not have vwap") try: _last = re.match("last", varName).span() if _last is not None: logging.debug(["_last is not none",_last]) tmp = varName[_last[1]:] logging.debug(tmp) if tmp == "D": X.loc[:, "lastD"] = data["last"].diff(1).values.reshape(-1) X.iloc[0, -1] = 0 continue # return X if tmp == "Log": X.loc[:, "lastLog"] = np.log(data["last"]).diff(1).values.reshape(-1) X.iloc[0, -1] = 0 continue # return X except: logging.debug("X data do not have last") #### mabp if varName == "mabp": try: X.loc[:,"mabp"] = mabp.values.reshape(-1) except: X.loc[:, "mabp"] = mabp.reshape(-1) continue if varName == "mabpEWM": for _com in args(varName,1): X.loc[:,"mabpEWM{}".format(_com)] = pd.DataFrame(mabp).ewm(com=_com).mean().values.reshape(-1) continue if varName == "mabpD": for _window in args(varName,1,lb=1): mabpDw = mabpD.rolling(window=_window).sum() mabpDw.iloc[:_window] = mabpD.values.reshape(-1)[:_window].cumsum() X.loc[:,"mabpD{}".format(_window)] = mabpDw.values.reshape(-1) #print X #logging.error(X) continue if varName == "mabpDEWM": for _com in args(varName, 1): mabpDw = mabpD.ewm(com=_com).mean() X.loc[:,"mabpDEWM{}".format(_com)] = mabpDw.values.reshape(-1) continue #### qwap if varName == "qwap": X.loc[:,"qwap"] = qwap.values.reshape(-1) continue if varName == "qwapD": X.loc[:,"qwapD"] = qwapD.values.reshape(-1) continue if varName == "qwapEWM": tmp = qwap.ewm(com=1).mean().values.reshape(-1) X.loc[:,"qwapDEWM"] = tmp continue if varName == "qwapDEWM": tmp = qwapD.ewm(com=1).mean().values.reshape(-1) X.loc[:, "qwapDEWM"] = tmp continue if varName == "askDaskbidQty": X.loc[:, "askDaskbidQty"] = data["askQty"].values / (data["bidQty"].values + data["askQty"].values) continue # return X if varName 
== "askDaskbidQtyEWM": for _com in args(varName,1): askQty = data["askQty"].ewm(com=_com).mean() bidQty = data["bidQty"].ewm(com=_com).mean() X.loc[:, "askDaskbidQty{}".format(_com) ] = askQty.values / (bidQty.values + askQty.values) continue # return X if varName == "askDaskbidQtyR": for _window in args(varName,2,lb=1): askQty = data["askQty"].rolling(window=_window).sum() askQty[:_window] = data["askQty"].values[:_window].cumsum() bidQty = data["bidQty"].rolling(window=_window).sum() bidQty[:_window] = data["bidQty"].values[:_window].cumsum() X.loc[:,"askDaskbidQtyR{}".format(_window)] = (askQty.values/(bidQty.values + askQty.values)).reshape(-1) # if varName == "ratio": # openInterestD = data["openInterest"].diff(1) # openInterestD.iloc[0] = 0 # X.loc[:, "ratio"] = (openInterestD.values / (data["volumeD"].values + (data["volumeD"].values == 0))).reshape(-1) # continue # # return X # if varName == "ratioL": # openInterestD = data["openInterest"].diff(1) # openInterestD.iloc[0] = 0 # ratio = openInterestD.values/(data["volumeD"].values+(data["volumeD"].values==0)) # logging.error(pd.isna(ratio).sum()) # X.loc[:,"ratioL"] = (pd.cut(ratio, bins=[-1.1, -0.75, -0.25, 0.25, 0.75, 1], labels=False) - 3).reshape(-1) # logging.error(pd.isna(X).sum()) # continue if varName == "fundSpread": fundSpread = data["askQty"]*data["askPrc"]-data["bidQty"]*data["bidPrc"] for _window in args(varName,1,lb=1): tmp = fundSpread.rolling(window=_window).sum() tmp[:_window] = fundSpread[:_window].cumsum() X.loc[:,"fundSpread{}".format(_window)] = tmp.values.reshape(-1) continue if varName == "fundSpreadEWM": for _com in args(varName,1): ask = pd.DataFrame(data["askQty"] * data["askPrc"]).ewm(com=_com).mean() bid = pd.DataFrame(data["bidQty"] * data["bidPrc"]).ewm(com=_com).mean() X.loc[:, "fundSpreadEWM{}".format(_com)] = (ask - bid).values.reshape(-1) continue if varName == "askbidDtotalRatio": X.loc[:,"askbidDtotalRatio"] = ((data["askQty"]*data["askPrc"]-data["bidQty"]*data["bidPrc"])/(data["askQty"]*data["askPrc"]+data["bidQty"]*data["bidPrc"])).values.reshape(-1) continue if varName == "askbidDtotalRatioR": ask = data["askQty"] * data["askPrc"] bid = data["bidQty"] * data["bidPrc"] for _window in args(varName,8,lb=1): _window = int(_window) tmpask = ask.rolling(window= _window).sum() tmpask[:_window] = ask[:_window].cumsum() tmpbid = bid.rolling(window=_window).sum() tmpbid[:_window] = bid[:_window].cumsum() X.loc[:,"askbidDtotalRatioR{}".format(_window)]= ((tmpask.values - tmpbid.values)/(tmpask.values + tmpbid.values)).reshape(-1) continue if varName == "askbidDtotalRatioEWM": for com in args(varName,0.1): #com = com/10 ask = data["askQty"] * data["askPrc"] ask = ask.ewm(com=com).mean() bid = data["bidQty"] * data["bidPrc"] bid = bid.ewm(com=com).mean() X.loc[:, "askbidDtotalRatioEWM{}".format(com)] = ( (ask.values - bid.values) / (ask.values + bid.values)).reshape(-1) continue # if varName == "askbidDturnover": #wuxiao # D = np.array(map(lambda x: 1 if x > 0 else (0 if x == 0 else -1), mabpD.values)) # tmp = ((data["bidQty"]*data["bidPrc"]+data["askQty"]*data["askPrc"])/data["turnoverD"]*10)*D # tmp[np.isinf(tmp)] = np.nan # tmp.fillna(0,inplace=True) # X.loc[:,"askbidDturnover"] = tmp.values.reshape(-1) # #print X # logging.debug(["x nan",pd.isna(X["askbidDturnover"]).sum()]) # continue if varName == "sov": D = np.array(map(lambda x: 1 if x > 0 else (0 if x == 0 else -1), mabpD.values)) for _window in args(varName,1,lb=1): obv = pd.Series(D * data["volumeD"]).rolling(window=_window).sum() obv[:_window] = 
(D * data["volumeD"])[:_window].cumsum() logging.debug(obv.values.reshape(-1)) X.loc[:,"sov{}".format(_window)] = obv.values.reshape(-1) logging.debug(X) continue if varName == "sovEWM": D = np.array(map(lambda x: 1 if x > 0 else (0 if x == 0 else -1), mabpD.values)) for _com in args(varName, 1): obvD = pd.DataFrame(D * data["volumeD"]).ewm(com = _com).mean() X.loc[:, "sovEWM{}".format(_com)] = obvD.values.reshape(-1) logging.debug(obvD.values.reshape(-1)) continue if varName == "soo": D = np.array(map(lambda x: 1 if x != 0 else 0, mabpD.values)) openInterestD = data["openInterest"].diff(1) for _window in args(varName, 60): openInterestD.iloc[0] = 0 soo = pd.DataFrame(D * openInterestD).rolling(window=_window).sum() soo.iloc[:_window,0] = (D * openInterestD).values[:_window].cumsum().reshape(-1) X.loc[:, "soo{}".format(_window)] = soo.values.reshape(-1) continue if varName == "sooEWM": D = np.array(map(lambda x: 1 if x != 0 else 0, mabpD.values)) openInterestD = data["openInterest"].diff(1) openInterestD.iloc[0] = 0 for _com in args(varName, 1): oboD = pd.DataFrame(D * openInterestD).ewm(com = _com).mean() X.loc[:, "sooEWM{}".format(_com)] = oboD.values.reshape(-1) continue if varName == "signUpDown": D = np.array(map(lambda x: 1 if x > 0 else (0 if x == 0 else -1), mabpD.values)) for _window in args(varName,2,lb=1): tmp = pd.DataFrame(D).rolling(window=_window).sum() tmp.iloc[:_window,0] = D[:_window].cumsum() if True: tmp = pd.qcut(tmp.values.reshape(-1), 10, duplicates='drop', labels=False) tmp = tmp * (10 / tmp.max()) X.loc[:, "signUpDown{}".format(_window)] = tmp.reshape(-1) else: X.loc[:,"signUpDown{}".format(_window)] = tmp.values.reshape(-1) continue if varName == "signUpDownL": D = np.array(map(lambda x: 1 if x > 0 else (0 if x == 0 else -1), mabpD.values)) tmp = np.full(shape=(len(D),),fill_value=0.0) _t = 0 for i in xrange(len(tmp)): tmp[i] = _t if D[i] > 0: if _t > 0: _t += 1 else: _t = 1 elif D[i] < 0: if _t < 0: _t -= 1 else: _t = -1 else: _t = 0 X.loc[:,"signUpDownL"] = tmp.reshape(-1) continue if varName == "midDvwap": for _diff in args(varName,5,lb=1): vwap = briefstats.get_vwap(_diff) X.loc[:,"midDvwap{}".format(_diff)] = mabp.values - vwap.values continue if varName == "qwapDvwap": qwap = briefstats.get_qwap() for _diff in args(varName,5,lb=1): vwap = briefstats.get_vwap(_diff) X.loc[:, "qwapDvwap{}".format(_diff)] = qwap.values - vwap.values #print X continue if varName == "rsv": for _window in args(varName,8,lb=2): X.loc[:,'rsv{}'.format(_window)]=get_rsv(mabp,window=_window).values.reshape(-1) continue if varName == "rsvEWM": for _com in args(varName,0.5): rsv = get_rsv(mabp) rsvEWM = rsv.ewm(_com) X.loc[:,"rsvEWM{}".format(_com)] = rsvEWM.mean() continue if varName == "rsvEWM": for _com in args(): pass if kwargs["qcut"]>0: _columns = X.columns if kwargs["qType"] == 'rank': for _col in _columns: tmp = pd.qcut(X.loc[:,_col],kwargs["qcut"],duplicates='drop',labels=False) tmp = tmp*(kwargs["qcut"]/tmp.max()) X.loc[:,_col] = tmp.values.reshape(-1) elif kwargs["qType"] in {'mid','left','right'}: for _col in _columns: tmp = pd.qcut(X.loc[:,_col],kwargs["qcut"],duplicates='drop').apply(lambda x: getattr(x, kwargs["qType"])).pipe(np.asarray) X.loc[:,_col] = tmp.reshape(-1) else: pass if kwargs["orth"]: _columns = X.columns for _col in _columns: tmp = orth(X.loc[:,_col],_ratio) try: X.loc[:, _col] = tmp.values.reshape(-1) except: X.loc[:, _col] = tmp.reshape(-1) filename = get_hash(_varName, kwargs["args"]) try: np.save('data/col{}.npy'.format(filename),X.columns) 
np.save("data/index{}.npy".format(filename),X.index) np.save("data/values{}.npy".format(filename),X.values) logging.debug("save sucessed") except: logging.debug("save failed") pass return X
dataset.loc[(dataset["Age"] > 29) & (dataset["Age"] <= 39), "Age"] = 3 dataset.loc[(dataset["Age"] > 29) & (dataset["Age"] <= 39), "Age"] = 3 dataset.loc[dataset["Age"] > 39, "Age"] = 4 sns.countplot(x="Age", data=train, hue="Survived") # In[24]: ## Boxplot for Fare sns.boxplot(x=train["Survived"], y=train["Fare"]) # The skewness of Fare feature is significantly high. Thus, I discretized the number of bin size based on the third quartile value; if the last bin starts with the third quartile value when bin size = n, then n number of bin will be selected. # In[25]: ## discretize Fare pd.qcut(train["Fare"], 8).value_counts() # In[26]: for dataset in total: dataset.loc[dataset["Fare"] <= 7.75, "Fare"] = 0 dataset.loc[(dataset["Fare"] > 7.75) & (dataset["Fare"] <= 7.91), "Fare"] = 1 dataset.loc[(dataset["Fare"] > 7.91) & (dataset["Fare"] <= 9.841), "Fare"] = 2 dataset.loc[(dataset["Fare"] > 9.841) & (dataset["Fare"] <= 14.454), "Fare"] = 3 dataset.loc[(dataset["Fare"] > 14.454) & (dataset["Fare"] <= 24.479), "Fare"] = 4 dataset.loc[(dataset["Fare"] > 24.479) & (dataset["Fare"] <= 31), "Fare"] = 5
def prepare_ranges(plot_df,groupby): if groupby == 'K_value': # ranges = K_value_ranges # plot_df.loc[:, 'group_range'] = pd.cut( # plot_df[groupby], ranges).astype(str) # plot_df.loc[plot_df[groupby] > ranges[-1], # 'group_range'] = '>{}'.format(ranges[-1]) # plot_df.loc[plot_df[groupby] == ranges[0], # 'group_range'] = ' {}'.format(ranges[0]) # plot_df.loc[plot_df[groupby] < ranges[0], # 'group_range'] = '<{}'.format(ranges[0]) # qcutted = pd.qcut(plot_df[plot_df[groupby]<1][groupby], 9,duplicates='drop') # categories = qcutted.cat.categories # qcutted_str = qcutted.astype(str) # qcutted_str[qcutted_str == str(categories[0])] = '(0, {}]'.format(categories[0].right) # qcutted_str[qcutted_str == str(categories[-1])] = '({}, 1)'.format(categories[-1].left) # plot_df.loc[plot_df[groupby]<1, 'group_range'] = qcutted_str # plot_df.loc[plot_df[groupby]>=1, 'group_range'] = '>= 1' ranges = condition_number_ranges cutted = pd.cut(plot_df[groupby], ranges,include_lowest=True) categories = cutted.cat.categories plot_df.loc[:, 'group_range'] = cutted.astype(str) plot_df.loc[plot_df[groupby] > ranges[-1], 'group_range'] = '>{}'.format(ranges[-1]) plot_df.loc[plot_df['group_range'] == str(categories[0]),'group_range'] = '[{},{}]'.format(ranges[0],categories[0].right) # plot_df.loc[plot_df[groupby] < ranges[0], # 'group_range'] = '<{}'.format(ranges[0]) def custom_sort(col): vals = [] for val in col.tolist(): if ',' in val: vals.append(float(val.split(',')[1][1:-1])) else: # vals.append(float(val[2:])) vals.append(float('inf')) return pd.Series(vals) return categories,None elif groupby in ['isoform_length']: def custom_sort(col): vals = [] for val in col.tolist(): if ',' in str(val): vals.append(float(val.split(',')[1][1:-1])) else: # vals.append(float(val[1:])) vals.append(float('inf')) return pd.Series(vals) plot_df[groupby] = plot_df[groupby].astype(int) if plot_df[groupby].max() > 3000: max_threshold = 4000 lower, higher = int(plot_df[groupby].min()), 4000 step_size = 400 else: max_threshold = 2100 lower, higher = int(plot_df[groupby].min()), 2100 step_size = 200 # # max_threshold = np.ceil(np.percentile(plot_df[groupby], 80)) # # lower, higher = int(plot_df.min()), int(plot_df.max()) # # step_size = int(math.ceil((higher - lower)/n_bins)) n_bins = 10 edges = [lower] + list( range(step_size, higher+1, step_size)) cutted,categories = pd.cut( plot_df.loc[plot_df[groupby] <= max_threshold, groupby], bins=edges,include_lowest=True,retbins=True) return categories,max_threshold elif groupby in ['num_exons','num_isoforms']: def custom_sort(col): vals = [] for val in col.tolist(): if ',' in val: vals.append(float(val.split(',')[1][1:-1])) else: # vals.append(float(val[1:])) vals.append(float('inf')) return pd.Series(vals) if groupby == 'num_exons': ranges = num_exons_range else: ranges = num_isoforms_range cutted = pd.cut( plot_df[groupby], ranges, right=False) categories = cutted.cat.categories plot_df.loc[:, 'group_range'] = cutted.apply(lambda x:str(x)).astype(str) plot_df.loc[plot_df[groupby] >= ranges[-1], 'group_range'] = '>={}'.format(ranges[-1]) plot_df.loc[plot_df['group_range'] == str( categories[0]), 'group_range'] = '[{}, {})'.format(int(ranges[0]), int(categories[0].right)) return categories, ranges[-1] else: plot_df[groupby] = plot_df[groupby].astype(int) max_threshold = np.ceil(np.percentile(plot_df[groupby], 90)) if (len(plot_df.loc[plot_df[groupby] <= max_threshold, groupby].unique())<10): n_bins = len(plot_df.loc[plot_df[groupby] <= max_threshold, groupby].unique()) else: n_bins = 10 
qcutted,categories = pd.qcut(plot_df.loc[plot_df[groupby] <= max_threshold, groupby], n_bins,labels=False,duplicates='drop',retbins=True) # lower, higher = temp_df.min(), temp_df.max() # if (len(plot_df.loc[plot_df[groupby] <= max_threshold, groupby].unique())<10): # n_bins = len(plot_df.loc[plot_df[groupby] <= max_threshold, groupby].unique()) # else: # n_bins = 10 # edges = list( # range(int(lower-1), int(higher), int(math.ceil((higher - lower)/n_bins)))) # edges.append(higher) # plot_df.loc[plot_df[groupby] <= max_threshold, 'group_range'] = pd.cut( # temp_df, bins=edges).astype('str') # plot_df.loc[plot_df[groupby] > max_threshold, # 'group_range'] = '>{}'.format(max_threshold) return categories,max_threshold
def fare_ordinal(this) -> object:
    # qcut each frame's own Fare column; labels must be an ordered sequence, not a set
    this.train['FareBand'] = pd.qcut(this.train['Fare'], 4, labels=[1, 2, 3, 4])
    this.test['FareBand'] = pd.qcut(this.test['Fare'], 4, labels=[1, 2, 3, 4])
    return this
#metr = c("age","fare") metr = ["age","fare"] #summary(df[metr]) df[metr].describe() # Create nominal variables for all metric variables (for linear models) before imputing ------------------------------- #metr_binned = paste0(metr,"_BINNED_") metr_binned = [x + "_BINNED_" for x in metr] #df[metr_binned] = map(df[metr], ~ { # cut(., unique(quantile(., seq(0,1,0.1), na.rm = TRUE)), include.lowest = TRUE) #}) df[metr_binned] = df[metr].apply(lambda x: pd.qcut(x, 10).astype(object)) df[metr_binned].describe() # Convert missings to own level ("(Missing)") #df[metr_binned] = map(df[metr_binned], ~ fct_explicit_na(., na_level = "(Missing)")) df[metr_binned] = df[metr_binned].fillna("(missing)") #summary(df[metr_binned],11) df[metr_binned].describe() {print(df[x].value_counts()[:11]) for x in metr_binned} # Handling missings ---------------------------------------------------------------------------------------------- # Remove covariates with too many missings from metr #misspct = map_dbl(df[metr], ~ round(sum(is.na(.)/nrow(df)), 3)) #misssing percentage
ax1.set_title(' ')
prob2 = stats.probplot(data['x2'], dist=stats.norm, plot=ax2)
ax2.set_xlabel('')
ax2.set_title(' ')
prob3 = stats.probplot(data['x3'], dist=stats.norm, plot=ax3)
ax3.set_xlabel('')
ax3.set_title(' ')

# 2. Binning of continuous data (unsupervised and supervised)
# Unsupervised: equal-width + equal-frequency + clustering
# (1) Fixed-width binning
newdata = np.floor_divide(data, k)   # bin by integer division by the bin width k
newdata = np.floor(np.log10(data))   # map to exponentially sized bins via log10
# (2) Quantile binning
df = data.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])  # decile bin edges
pd.qcut(data, 4, labels=False)       # quartile binning, returning the bin index
data = pd.Series(data)
data.quantile([0.25, 0.5, 0.75])
# (3) Clustering-based binning
# Supervised: chi-square binning, univariate decision-tree algorithms (ID3/C4.5/CART),
# and IV-maximising binning for credit-scoring models
# (1) Chi-square binning
# (2) CART decision-tree binning (each leaf node holds >= 5% of all samples;
#     an internal node needs >= 10% of all samples before it may be split further)
import pandas as pd
import numpy as np

sample_set = pd.read_csv('data')

def calc_score_median(sample_set, var):
    '''
    Compute the medians between adjacent score values, to serve as candidate
    binary split points for the decision tree
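# A minimal sketch of the clustering-based binning listed above but not implemented in
# the snippet; scikit-learn's KMeans is used here as an assumption (it is not imported
# by the original code). Clusters are relabelled so bin 0 has the smallest centre.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

values = pd.Series(np.random.default_rng(1).lognormal(size=300), name='x')
km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(values.to_frame())

order = np.argsort(km.cluster_centers_.ravel())        # order clusters by centre
relabel = {old: new for new, old in enumerate(order)}
bins = pd.Series(km.labels_, name='cluster_bin').map(relabel)
print(pd.concat([values, bins], axis=1)
        .groupby('cluster_bin')['x'].agg(['min', 'max', 'count']))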
# In[7]: data = data[data['engagement'] > 0] print(data.shape) data.head() # In[9]: # Now lets check the descriptive stats data.describe() # In[10]: data['engagement_bucket'] = pd.qcut(data['engagement'], q=[0, 0.5, 0.75, 1], labels=['Low', 'Medium', 'High']) data.head() # In[11]: # sns.countplot(x='engagement_bucket', data=data) # plt.show() # In[12]: # Creating time related features such as time, day, etc. data['day'] = data['published'].dt.day data['hour'] = data['published'].dt.hour data['week_day'] = data['published'].dt.weekday
#complete missing fare with median dataset['Fare'].fillna(dataset['Fare'].median(), inplace=True) ds_train.info() #Delete unwanted columns drop_column = ['PassengerId', 'Cabin', 'Ticket'] ds_train.drop(drop_column, axis=1, inplace=True) ds_test.drop(drop_column, axis=1, inplace=True) for dataset in X_pack: #Discrete variables dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1 dataset['Title'] = dataset['Name'].str.split( ", ", expand=True)[1].str.split(".", expand=True)[0] dataset['FareBin'] = pd.qcut( dataset['Fare'], 4, labels=['cheap', 'medium', 'high', 'expensive']) dataset['AgeBin'] = pd.cut( dataset['Age'].astype(int), 5, labels=['kid', 'young_adult', 'adult', 'mature', 'old']) #cleanup rare title names stat_min = 10 title_names = (ds_train['Title'].value_counts() < stat_min ) # True/False separation ds_train['Title'] = ds_train['Title'].apply( lambda x: 'Unique' if title_names.loc[x] == True else x) print(ds_train['Title'].value_counts()) #define y variable aka target/outcome Target = ['Survived']
def bin_score_deciles(self, fname='Deciles', bar=True, line=True, ylabel_bar='Probability', xlabel_bar='Score Deciles', ylabel_line='True', xlabel_line='Predicted', opacity=0.8, title='Deciles', plot_format='.pdf'): """This method is used to plot the performance of the predicted scores of the model verses the true label based on the deciles of the predictions. First the prediction scores are divided into 10 ranges (deciles), the mean of the true label and the prediction scores are calculated for each range and plotted :param fname: str The name of the file under which the plot is stored :param bar: bool Whether a bar plot is requested for the deciles or not :param line: bool Whether a line plot is requested for the deciles or not :param ylabel_bar: str It defines what to be written on the Y-axis of the bar plot :param xlabel_bar: str It defines what to be written on the X-axis of the bar plot :param ylabel_line: str It defines what to be written on the Y-axis of the line plot :param xlabel_line: str It defines what to be written on the X-axis of the line plot :param opacity: float The degree of the opacity of the bar plot :param title: str The title of the plot :param plot_format: str This defines the format used to save the plot '.png', '.jpg', '.pdf' :return None, It saves the requested plot on disk """ df = pd.DataFrame({'CHURN_SCORE': self.pred_score, 'TRUE_SCORE': self.true_label}) deciles = pd.qcut(df['CHURN_SCORE'], 10, duplicates='drop') df['SCORE_GROUP'] = deciles.values.codes df_graph = df.groupby(['SCORE_GROUP'])['CHURN_SCORE', 'TRUE_SCORE'].mean().reset_index() if bar: fig, ax = plt.subplots(figsize=(5, 5)) ax = df_graph.plot(x='SCORE_GROUP', y='CHURN_SCORE', kind='bar', ax=ax, legend=False, color=Config.colors['RED'], label='Predicted', alpha=opacity) ax = df_graph.plot(x='SCORE_GROUP', y='TRUE_SCORE', kind='bar', ax=ax, legend=False, color=Config.colors['YEL'], label='True', alpha=opacity) ax.set_ylabel(ylabel_bar) ax.set_xlabel(xlabel_bar) ax.set_title(title + ' (bar)', fontsize=Config.TIT_FS, fontweight='bold') ax.legend(loc="best") fig.savefig(os.path.join(self.viz_dir, fname + '_bar' + plot_format), bbox_inches='tight') plt.close() if line: fig, ax = plt.subplots(figsize=(5, 5)) ax = df_graph.plot(x='CHURN_SCORE', y='TRUE_SCORE', ax=ax, legend=False, color=Config.colors['RED']) ax.set_xlabel(xlabel_line) ax.set_ylabel(ylabel_line) ax.set_title(title + ' (line)', fontsize=Config.TIT_FS, fontweight='bold') fig.savefig(os.path.join(self.viz_dir, fname + '_line' + plot_format), bbox_inches='tight') plt.close()
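# A self-contained sketch (synthetic scores and labels, assumed column names, not the
# class above) of the decile technique described in the docstring: qcut the prediction
# scores into 10 groups, then compare mean predicted score and mean true label per group.
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
scores = pd.Series(rng.uniform(size=1000), name='pred_score')
labels = (rng.uniform(size=1000) < scores).astype(int).rename('true_label')

df = pd.concat([scores, labels], axis=1)
df['decile'] = pd.qcut(df['pred_score'], 10, labels=False, duplicates='drop')
per_decile = df.groupby('decile')[['pred_score', 'true_label']].mean()
print(per_decile)  # a well-calibrated model keeps the two columns close in every decile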
def main(): '''Creates example_signal_upload.csv to upload for validation and live data submission''' napi = numerapi.SignalsAPI() # read in list of active Signals tickers which can change slightly era to era eligible_tickers = pd.Series(napi.ticker_universe(), name='bloomberg_ticker') print(f"Number of eligible tickers: {len(eligible_tickers)}") # read in yahoo to bloomberg ticker map, still a work in progress, h/t wsouza ticker_map = pd.read_csv( 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv' ) print(f"Number of tickers in map: {len(ticker_map)}") # map eligible numerai tickers to yahoo finance tickers yfinance_tickers = eligible_tickers.map( dict(zip(ticker_map['bloomberg_ticker'], ticker_map['yahoo']))).dropna() bloomberg_tickers = ticker_map['bloomberg_ticker'] print(f'Number of eligible, mapped tickers: {len(yfinance_tickers)}') # download data n = 1000 # chunk row size chunk_df = [ yfinance_tickers.iloc[i:i + n] for i in range(0, len(yfinance_tickers), n) ] concat_dfs = [] print("Downloading data...") for df in chunk_df: try: # set threads = True for faster performance, but tickers will fail, script may hang # set threads = False for slower performance, but more tickers will succeed temp_df = yfinance.download(df.str.cat(sep=' '), start='2005-12-01', threads=False) temp_df = temp_df['Adj Close'].stack().reset_index() concat_dfs.append(temp_df) except: # simplejson.errors.JSONDecodeError: pass full_data = pd.concat(concat_dfs) # properly position and clean raw data, after taking adjusted close only full_data.columns = ['date', 'ticker', 'price'] full_data.set_index('date', inplace=True) # convert yahoo finance tickers back to numerai tickers full_data['bloomberg_ticker'] = full_data.ticker.map( dict(zip(ticker_map['yahoo'], bloomberg_tickers))) print('Data downloaded.') print( f"Number of tickers with data: {len(full_data.bloomberg_ticker.unique())}" ) ticker_groups = full_data.groupby('ticker') full_data['RSI'] = ticker_groups['price'].transform(lambda x: RSI(x)) # group by era (date) and create quintile labels within each era, useful for learning relative ranking date_groups = full_data.groupby(full_data.index) full_data['RSI_quintile'] = date_groups['RSI'].transform( lambda group: pd.qcut(group, 5, labels=False, duplicates='drop')) full_data.dropna(inplace=True) # create lagged features grouped by ticker ticker_groups = full_data.groupby('ticker') num_days = 5 # lag 0 is that day's value, lag 1 is yesterday's value, etc for day in range(num_days + 1): full_data[f'RSI_quintile_lag_{day}'] = ticker_groups[ 'RSI_quintile'].transform(lambda group: group.shift(day)) # create difference of the lagged features and absolute difference of the lagged features (change in RSI quintile by day) for day in range(num_days): full_data[f'RSI_diff_{day}'] = full_data[ f'RSI_quintile_lag_{day}'] - full_data[ f'RSI_quintile_lag_{day + 1}'] full_data[f'RSI_abs_diff_{day}'] = np.abs( full_data[f'RSI_quintile_lag_{day}'] - full_data[f'RSI_quintile_lag_{day + 1}']) # define column names of features, target, and prediction feature_names = [f'RSI_quintile_lag_{num}' for num in range(num_days)] + [ f'RSI_diff_{num}' for num in range(num_days) ] + [f'RSI_abs_diff_{num}' for num in range(num_days)] print(f'Features for training:\n {feature_names}') TARGET_NAME = 'target' PREDICTION_NAME = 'signal' # read in Signals targets targets = pd.read_csv('historical_targets.csv') targets['date'] = pd.to_datetime(targets['friday_date'], format='%Y%m%d') # merge our feature 
data with Numerai targets ML_data = pd.merge(full_data.reset_index(), targets, on=['date', 'bloomberg_ticker']).set_index('date') # print(f'Number of eras in data: {len(ML_data.index.unique())}') # for training and testing we want clean, complete data only ML_data.dropna(inplace=True) ML_data = ML_data[ML_data.index.weekday == 4] # ensure we have only fridays ML_data = ML_data[ML_data.index.value_counts() > 50] # drop eras with under 50 observations per era # train test split train_data = ML_data[ML_data['data_type'] == 'train'] test_data = ML_data[ML_data['data_type'] == 'validation'] # train model print("Training model...") model = GradientBoostingRegressor(subsample=0.1) model.fit(train_data[feature_names], train_data[TARGET_NAME]) print("Model trained.") # predict test data test_data[PREDICTION_NAME] = model.predict(test_data[feature_names]) # predict live data # choose data as of most recent friday last_friday = datetime.now() + relativedelta(weekday=FR(-1)) date_string = last_friday.strftime('%Y-%m-%d') try: live_data = full_data.loc[date_string].copy() except KeyError as e: print(f"No ticker on {e}") live_data = full_data.iloc[:0].copy() live_data.dropna(subset=feature_names, inplace=True) # get data from the day before, for markets that were closed # on the most recent friday last_thursday = last_friday - timedelta(days=1) thursday_date_string = last_thursday.strftime('%Y-%m-%d') thursday_data = full_data.loc[thursday_date_string] # Only select tickers than aren't already present in live_data thursday_data = thursday_data[~thursday_data.ticker.isin(live_data.ticker. values)].copy() thursday_data.dropna(subset=feature_names, inplace=True) live_data = pd.concat([live_data, thursday_data]) print(f"Number of live tickers to submit: {len(live_data)}") live_data[PREDICTION_NAME] = model.predict(live_data[feature_names]) # prepare and writeout example file diagnostic_df = pd.concat([test_data, live_data]) diagnostic_df['friday_date'] = diagnostic_df.friday_date.fillna( last_friday.strftime('%Y%m%d')).astype(int) diagnostic_df['data_type'] = diagnostic_df.data_type.fillna('live') diagnostic_df[['bloomberg_ticker', 'friday_date', 'data_type', 'signal']].reset_index(drop=True).to_csv( 'example_signal_upload.csv', index=False) print( 'Example submission completed. Upload to signals.numer.ai for scores and live submission' )
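# A minimal sketch (synthetic data, illustrative column names) of the per-era quintile
# labelling used above: grouping by date and applying pd.qcut inside transform ranks
# each ticker only against the other tickers of the same era.
import numpy as np
import pandas as pd

rng = np.random.default_rng(4)
demo = pd.DataFrame({
    'date': pd.date_range('2021-01-01', periods=4, freq='W-FRI').repeat(25),
    'signal_raw': rng.normal(size=100),
})
demo['signal_quintile'] = demo.groupby('date')['signal_raw'].transform(
    lambda g: pd.qcut(g, 5, labels=False, duplicates='drop'))
print(demo.groupby('date')['signal_quintile'].value_counts().sort_index())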
def main(): train_df = pd.read_csv('data_files/train.csv') test_df = pd.read_csv('data_files/test.csv') combine = [train_df, test_df] # print('{}'.format(train_df.columns.values)) # print('{}'.format(test_df.columns.values)) # print('{}'.format(train_df.head())) # print('{}'.format(train_df.tail())) # print('*' * 40) train_df.info() # print('*'*40) # test_df.info() # print('*' * 40) # print('{}'.format(train_df.describe(percentiles=[.61, .62]))) print('{}'.format(train_df.describe(include=['O']))) # print('{}'.format( # train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean() # .sort_values(by='Survived', ascending=False) # )) # print('{}'.format( # train_df[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean() # .sort_values(by='Survived', ascending=False) # )) # print('{}'.format( # train_df[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean() # .sort_values(by='Survived', ascending=False) # )) # print('{}'.format( # train_df[["Parch", "Survived"]].groupby(['Parch'], as_index=False).mean() # .sort_values(by='Survived', ascending=False) # )) train_df_age = train_df[["Age", "Survived"]] train_df_age['Age'] = train_df_age['Age'].apply(np.round) print('{}'.format(train_df_age[["Age", "Survived"]].groupby( ['Age'], as_index=False).mean().sort_values(by='Age', ascending=True))) # g = sns.FacetGrid(train_df, col='Survived') # g.map(plt.hist, 'Age', bins=40) # # grid = sns.FacetGrid(train_df, col='Pclass', hue='Survived') # # grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', size=2.2, aspect=1.6) # grid.map(plt.hist, 'Age', alpha=.8, bins=20) # grid.add_legend() # # # grid = sns.FacetGrid(train_df, col='Embarked') # grid = sns.FacetGrid(train_df, row='Embarked', size=2.2, aspect=1.6) # grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep') # grid.add_legend() # grid = sns.FacetGrid(train_df, col='Embarked', hue='Survived', palette={0: 'k', 1: 'w'}) # # grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', size=2.2, aspect=1.6) # grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None) # grid.add_legend() # plt.show() # lets do come cleanup of data print('Data before cleanup: {} {} {} {}'.format(train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)) train_df = train_df.drop(['Ticket', 'Cabin'], axis=1) test_df = test_df.drop(['Ticket', 'Cabin'], axis=1) combine = [train_df, test_df] print( 'Data after cleanup: train shape: {} test shape: {} combine shapes:{} {}' .format(train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)) # extracting titles from names and replacement for data_set in combine: data_set['Title'] = data_set.Name.str.extract(' ([A-Za-z]+)\.', expand=False) # print('{}'.format(pd.crosstab(train_df['Title'], train_df['Sex']))) for data_set in combine: data_set['Title'] = data_set['Title'].replace([ 'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona' ], 'Rare') data_set['Title'] = data_set['Title'].replace(['Mlle', 'Ms'], 'Miss') data_set['Title'] = data_set['Title'].replace('Mme', 'Mrs') # print('{}'.format(pd.crosstab(train_df['Title'], train_df['Sex']))) # print('{}'.format(train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())) title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} for data_set in combine: data_set['Title'] = data_set['Title'].map(title_mapping) data_set['Title'] = data_set['Title'].fillna(0) # print('{}'.format(combine[0].head())) train_df.drop(train_df[['Name', 'PassengerId']], axis=1, 
inplace=True) test_df.drop(test_df[['Name']], axis=1, inplace=True) # print('{}'.format(combine[0].head())) # print('{}'.format(combine[1].head())) # further changing features to numerical, ex sex: male -> 0, female -> 1 sex_mapping = {"male": 0, "female": 1} for data_set in combine: data_set['Sex'] = data_set['Sex'].map(sex_mapping).astype(int) # print('{}'.format(combine[0].head())) # we will guess NaN values of age through median, # but for given record from correlation between gender and Pclass of all passengers # grid = sns.FacetGrid(train_df, row='Pclass', col='Sex', size=2.2, aspect=1.6) # grid.map(plt.hist, 'Age', alpha=.5, bins=20) # grid.add_legend() # plt.show() guess_ages = np.zeros((2, 3)) # for every combination of sex and Pclass for data_set in combine: for i in [0, 1]: # gender for j in [1, 2, 3]: # Pclass guess_df = data_set[(data_set['Sex'] == i) & (data_set['Pclass'] == j)]['Age'].dropna() # alternative for median # age_mean = guess_df.mean() # age_std = guess_df.std() # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std) age_guess = guess_df.median() guess_ages[i, j - 1] = int(age_guess / 0.5 + 0.5) * 0.5 # now assigning computed age guesses for i in [0, 1]: # gender for j in [1, 2, 3]: # Pclass data_set.loc[(data_set.Age.isnull()) & (data_set.Sex == i) & (data_set.Pclass == j), 'Age'] = \ guess_ages[ i, j - 1] data_set['Age'] = data_set['Age'].astype(int) # print('{}'.format(train_df.head())) train_df['AgeBand'] = pd.cut(train_df['Age'], 5) # print('{}'.format(train_df.head())) # print('{}'.format( # train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand') # ) # ) # replacing age values based on bands for data_set in combine: data_set.loc[data_set['Age'] <= 16, 'Age'] = 0 data_set.loc[(data_set['Age'] > 16) & (data_set['Age'] <= 32), 'Age'] = 1 data_set.loc[(data_set['Age'] > 32) & (data_set['Age'] <= 48), 'Age'] = 2 data_set.loc[(data_set['Age'] > 48) & (data_set['Age'] <= 64), 'Age'] = 3 data_set.loc[data_set['Age'] > 64, 'Age'] = 4 train_df.drop(['AgeBand'], 1, inplace=True) combine = [train_df, test_df] for dataset in combine: dataset['FamilySize'] = dataset['SibSp'] + dataset[ 'Parch'] + 1 # creating new feature family size, by combining parent-child, sibling-spouse print('{}'.format(train_df[['FamilySize', 'Survived']].groupby( ['FamilySize'], as_index=True).agg( ['mean', 'count']).reset_index().sort_values([('Survived', 'mean')], ascending=False))) for dataset in combine: dataset['IsAlone'] = 0 dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1 # print('{}'.format(train_df.loc[train_df['IsAlone'] == 1, ['IsAlone']].count())) train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1) test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1) combine = [train_df, test_df] for dataset in combine: dataset['Age*Class'] = dataset.Age * dataset.Pclass # print('{}'.format(train_df.head())) # print('{}'.format(train_df[['Age*Class', 'Survived']].groupby(['Age*Class'], as_index=False).mean())) # print('{}'.format(train_df[['Embarked', 'Survived']].groupby(['Embarked']).count())) # print('count of all: {}'.format(train_df.count())) most_freq_port = train_df.Embarked.dropna().mode()[0] # print('{}'.format(most_freq_port)) for dataset in combine: dataset['Embarked'] = dataset['Embarked'].fillna(most_freq_port) result = train_df[['Embarked', 'Survived']].groupby( ['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False) # print('{}'.format(result)) # converting embarked to 
numerical feature: S -> 0, C -> 1, Q -> 2 for dataset in combine: dataset['Embarked'] = dataset['Embarked'].map({ 'S': 0, 'C': 1, 'Q': 2 }).astype(int) # print('{}'.format(train_df.head())) # print('nulls in fare train: {}'.format(train_df.Fare.isnull().sum())) # print('nulls in fare test: {}'.format(test_df.Fare.isnull().sum())) # only one missing value for fare in test_df, so we can replace that with median test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True) # print('nulls in fare test: {}'.format(test_df.Fare.isnull().sum())) # print('{}'.format(test_df.head())) train_df['FareBand'] = pd.qcut(train_df['Fare'], 4) # print(train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)) # assigning fareband ordinal values based on ranges for dataset in combine: dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0 dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1 dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2 dataset['Fare'] = dataset['Fare'].astype(int) train_df = train_df.drop(['FareBand'], axis=1) print('{}'.format(train_df.head())) combine = [train_df, test_df]
train['Embarked'].fillna('S', inplace=True)
train['Embarked_clean'] = train['Embarked'].astype('category').cat.codes
test['Embarked_clean'] = test['Embarked'].astype('category').cat.codes

###Family
train['Family'] = 1 + train['SibSp'] + train['Parch']
test['Family'] = 1 + test['SibSp'] + test['Parch']

###Solo
train['Solo'] = (train['Family'] == 1)
test['Solo'] = (test['Family'] == 1)

###Fare
train['FareBin'] = pd.qcut(train['Fare'], 5)
test['FareBin'] = pd.qcut(test['Fare'], 5)
#print(train['FareBin'].value_counts())
train['Fare_clean'] = train['FareBin'].astype('category').cat.codes
test['Fare_clean'] = test['FareBin'].astype('category').cat.codes
#print(train['Fare_clean'].value_counts())

###Title
train['Title'] = train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
train['Title'] = train['Title'].replace([
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir',
def funding_table(list_of_list, df): colname = ["campaign_price", "campaign_people", "title"] table = pd.DataFrame(columns=colname) df1 = pd.DataFrame() for i in range(len(list_of_list)): df1 = pd.DataFrame(list_of_list[i][1]) # df1 = df1.drop(["campaign_img", "campaign_content", "funding_price", "total_price", "ratio"], axis=1) title = [] for j in range(len(df1)): t = list_of_list[i][0] title.append(t) df1["title"] = title table = pd.concat([table, df1]) table = table.drop([ "campaign_img", "campaign_content", "funding_price", "total_price", "ratio" ], axis=1) table = table.sort_values(by="campaign_price") table.index = range(len(table)) grouping = pd.qcut(table["campaign_price"], 10, labels=False) grouped = table["campaign_price"].groupby(grouping) test = grouped.apply(get_stats) bar = [] for i in range(10): bar.append(str(test[i]["min"]) + "-" + str(test[i]["max"])) group = [] for i in range(len(table)): for j in range(10): if (table["campaign_price"][i] >= test[j][0]) & (table["campaign_price"][i] <= test[j][1]): group.append(j) table["group"] = group fundraisings = [] proj_id = df["id"] color = [ "#98d86d", "#61Bf81", "#61bfbf", "#79aad0", "#41709e", "#cda7dd", "#a286c7", "#7154c0", "#aa67d1", "#d167b2" ] for i in range(len(df)): print(i) proj = proj_id[i] name = df["title"][i] url = df["url"][i] fund = df["funding_target"][i] now_fund = df["now_funding"][i] fund_ratio = (now_fund / fund) * 100 round_fund_ratio = round_up(fund_ratio) con = table["project"] == proj tab1 = table[con] tab1 = tab1.drop(["campaign_price", "project", "title", "id"], axis=1) tab1 = tab1.groupby(["group"]).sum() people = [] number = tab1.index for j in range(10): if (j in number): people.append(int(tab1["campaign_people"][j])) else: people.append(0) # id_num = [1,2,3,4,5,6,7,8,9,10] fund = { "id": int(i + 1), "color": color[i], "name": name, "url": url, "data": people, "proportion": round_fund_ratio } fundraisings.append(fund) data1 = interval(df) minnum = data1["mininterval"] maxnum = data1["maxinterval"] chart = { "mininterval": int(minnum), "maxinterval": int(maxnum), "bar": bar, "fundraising": fundraisings } return chart
df_train[['isInfant', 'Survived']].groupby('isInfant').mean() # In[ ]: df_train[['isKid', 'Survived']].groupby('isKid').mean() # In[ ]: df_train[['isOld', 'Survived']].groupby('isOld').mean() # now create the new 'AgeBand' feature # In[ ]: for df in df_combine: df['tmpt_AgeBand'] = pd.qcut(df['Age'], 4) df_train.head() df_train[['tmpt_AgeBand', 'Survived' ]].groupby(['tmpt_AgeBand'], as_index=False).mean().sort_values(by='tmpt_AgeBand', ascending=True) # In[ ]: df_train.drop(labels='tmpt_AgeBand', inplace=True, axis=1) # In[ ]: for df in df_combine: df['AgeBand'] = 0 df.loc[df.Age <= 21, 'AgeBand'] = 0
    s = r.std(ddof=0).shift(1)
    z = (x - m) / s
    z_min = np.min(z)
    z_max = np.max(z)
    z = (z - z_min) / (z_max - z_min)
    z = z * 2 - 1
    return z

spy['zscore'] = zscore(spy['delta'], window=36)
spy['zscore'].plot(figsize=figsize)
plt.legend()

# %%
spy = spy.dropna()
print spy['zscore'].describe()
bottom = np.percentile(spy['zscore'], 20)
high = np.percentile(spy['zscore'], 80)
print bottom, high

# %%
plt.figure(figsize=figsize)
worst_days = spy['zscore'] < bottom
spy['price'].plot()
spy.loc[worst_days, 'price'].plot()
plt.show()

# %%
plt.figure(figsize=figsize)
spy['fwd returns'].groupby(pd.qcut(spy['zscore'], 10)).mean().plot(kind='bar')

#%%