def sum_df_sequence(seq: Iterable[DataFrame], fill_value: Union[int, float] = 0) -> DataFrame:
    """Sum a sequence of DataFrames whose indexes or columns may differ.

    Every frame is aligned to the union of all row and column labels seen so
    far; a cell that is absent from a frame contributes ``fill_value`` to the
    total. Useful when the frames are supposed to share the same labels but a
    few of them are missing some rows or columns.

    Args:
        seq (Iterable[pandas.DataFrame]): Any iterable of DataFrames, ordered
            or unordered. The frames themselves are not modified.
        fill_value (Union[int, float], optional): Defaults to ``0``. The value
            each frame contributes for a cell it does not contain.

    Returns:
        pandas.DataFrame: The sum over all items in ``seq``. An empty
        DataFrame when ``seq`` is empty.
    """
    common_index = Index([])
    common_columns = Index([])
    accumulator = DataFrame()
    for n_seen, df in enumerate(seq):
        # Cells that are new to the accumulator were missing from every frame
        # processed so far, so they start at ``n_seen * fill_value`` (zero for
        # the very first frame) rather than at a single ``fill_value``.
        backfill = fill_value * n_seen
        if not df.index.equals(common_index):
            # ``Index.union`` instead of ``|``: the operator is no longer a
            # set operation on recent pandas versions.
            common_index = common_index.union(df.index)
            accumulator = accumulator.reindex(index=common_index, fill_value=backfill)
            df = df.reindex(index=common_index, fill_value=fill_value)
        if not df.columns.equals(common_columns):
            common_columns = common_columns.union(df.columns)
            accumulator = accumulator.reindex(columns=common_columns, fill_value=backfill)
            df = df.reindex(columns=common_columns, fill_value=fill_value)
        # Both operands now carry identical labels, so this is a plain
        # element-wise addition that never touches the caller's frame.
        accumulator = accumulator + df
    return accumulator
def _prepare_observations(cls, feature_extractor, text_classes, assessments): texts, classes = zip(*text_classes) features_lists = {fn:[] for fn in feature_extractor.FEATURES} for text in texts: features = feature_extractor.extract(text) assert features_lists.keys() == features.keys() for feature_name, value in features.items(): features_lists[feature_name].append(value) features_pd = DataFrame(features_lists) # ensure proper column order features_pd.reindex_axis(feature_extractor.FEATURES, axis=1) # Convert classes to a categorical ass_map = {c:i for i, c in enumerate(assessments)} # Int map ass_int = [ass_map[c] for c in classes] # Convert to int classes_pd = DataFrame( {'class': Categorical(ass_int, levels=assessments)}, index = list(range(1, len(ass_int)+1))) # return features_pd, classes_pd
def parse_splits(fname_or_df, outname=None):
    """From csv with columns name, break, yield a df with start, end and length.

    Parameters:
        fname_or_df: filename of a csv file, or a DataFrame. The table needs
            either a ``break`` column (consecutive breaks become start/end
            pairs) or ``start`` and ``end`` columns. If a DataFrame with
            ``start``/``end`` is given, its length is recalculated and the
            ``length`` column is written onto that same DataFrame.
        outname: optional csv filename; when given, the full table is written
            there without the index before the columns are restricted.

    Returns:
        df with the columns listed in OUTCOLS.
        Units are in milliseconds.

    Raises:
        ValueError: if the input has neither a ``break`` nor a ``start`` column.
        AssertionError: if a column listed in OUTCOLS is missing.
    """
    if isinstance(fname_or_df, str):
        # Function-scope import: ``DataFrame.from_csv`` no longer exists and
        # only ``DataFrame`` is known to be imported at module level.
        from pandas import read_csv
        log = read_csv(fname_or_df, index_col=None)
    else:
        log = fname_or_df
    if 'break' in log:
        # Row i spans from break i to break i + 1, so the last break only
        # closes the previous row.
        df = DataFrame({'name': log['name'][0:-1].values,
                        'start': log['break'][0:-1].values,
                        'end': log['break'][1:].values,
                        'order': range(len(log) - 1)})
    elif 'start' in log:
        df = log
    else:
        raise ValueError('needs to either have break, or start and end columns')
    # if splits are given in time, convert to millisecond
    # (positional lookup so a non-zero-based index or an empty table is fine)
    if len(df) and ':' in str(df['start'].iloc[0]):
        for col in ['start', 'end']:
            df[col] = df[col].apply(time2ms)
    # calculate length
    df['length'] = df['end'] - df['start']
    assert all(col in df for col in OUTCOLS)
    if outname:
        df.to_csv(outname, index=False)
    return df.reindex(columns=OUTCOLS)
def __reindex_cols_old_api(df: pd.DataFrame, partial_ordering: Sequence[str]) -> pd.DataFrame:
    """Reorder the columns of ``df`` through the legacy ``reindex_axis`` call.

    Columns are sorted by the key ``__element_order`` assigns to each column
    name for the given ``partial_ordering``; no copy is requested.
    """
    def sort_key(col_name):
        return __element_order(col_name, partial_ordering)

    ordered_columns = sorted(df.columns, key=sort_key)
    return df.reindex_axis(ordered_columns, axis=1, copy=False)
def test_reindex_fill_value(self):
    """``fill_value`` is honoured by ``reindex`` and by deprecated ``reindex_axis``."""
    frame = DataFrame(np.random.randn(10, 4))

    # axis=0: without fill_value the five new rows are NaN
    padded = frame.reindex(list(range(15)))
    assert np.isnan(padded.values[-5:]).all()

    filled = frame.reindex(range(15), fill_value=0)
    wanted = frame.reindex(range(15)).fillna(0)
    assert_frame_equal(filled, wanted)

    # axis=1: the new column takes the fill value for float, int and str fills
    for fill in (0., 0, 'foo'):
        filled = frame.reindex(columns=range(5), fill_value=fill)
        wanted = frame.copy()
        wanted[4] = fill
        assert_frame_equal(filled, wanted)

    # reindex_axis: same results, but each call must emit a FutureWarning
    with tm.assert_produces_warning(FutureWarning):
        filled = frame.reindex_axis(range(15), fill_value=0., axis=0)
    wanted = frame.reindex(range(15)).fillna(0)
    assert_frame_equal(filled, wanted)

    with tm.assert_produces_warning(FutureWarning):
        filled = frame.reindex_axis(range(5), fill_value=0., axis=1)
    wanted = frame.reindex(columns=range(5)).fillna(0)
    assert_frame_equal(filled, wanted)

    # other dtypes: add an object column and fill new rows again
    frame['foo'] = 'foo'
    filled = frame.reindex(range(15), fill_value=0)
    wanted = frame.reindex(range(15)).fillna(0)
    assert_frame_equal(filled, wanted)
def test_reindex_fill_value(self):
    """``fill_value`` is honoured by ``reindex`` and by deprecated ``reindex_axis``."""
    frame = DataFrame(np.random.randn(10, 4))

    def check(actual, wanted):
        assert_frame_equal(actual, wanted)

    # axis=0: rows 10..14 do not exist, so they come back as NaN by default
    grown = frame.reindex(lrange(15))
    assert np.isnan(grown.values[-5:]).all()

    check(frame.reindex(lrange(15), fill_value=0),
          frame.reindex(lrange(15)).fillna(0))

    # axis=1: column 4 is new and must hold exactly the fill value
    for fill in (0., 0, 'foo'):
        actual = frame.reindex(columns=lrange(5), fill_value=fill)
        wanted = frame.copy()
        wanted[4] = fill
        check(actual, wanted)

    # reindex_axis: only the deprecated call sits inside the warning check
    with tm.assert_produces_warning(FutureWarning):
        actual = frame.reindex_axis(lrange(15), fill_value=0., axis=0)
    check(actual, frame.reindex(lrange(15)).fillna(0))

    with tm.assert_produces_warning(FutureWarning):
        actual = frame.reindex_axis(lrange(5), fill_value=0., axis=1)
    check(actual, frame.reindex(columns=lrange(5)).fillna(0))

    # other dtypes
    frame['foo'] = 'foo'
    check(frame.reindex(lrange(15), fill_value=0),
          frame.reindex(lrange(15)).fillna(0))
def test_reindex_fill_value(self):
    """``fill_value`` is applied by both ``reindex`` and ``reindex_axis``."""
    data = DataFrame(np.random.randn(10, 4))

    # axis=0: the five appended rows are NaN when no fill value is given
    extended = data.reindex(lrange(15))
    tail_is_nan = np.isnan(extended.values[-5:]).all()
    self.assertTrue(tail_is_nan)

    got = data.reindex(lrange(15), fill_value=0)
    want = data.reindex(lrange(15)).fillna(0)
    assert_frame_equal(got, want)

    # axis=1: float, int and string fills for the new column 4
    for fill in (0.0, 0, "foo"):
        got = data.reindex(columns=lrange(5), fill_value=fill)
        want = data.copy()
        want[4] = fill
        assert_frame_equal(got, want)

    # reindex_axis, once per axis
    got = data.reindex_axis(lrange(15), fill_value=0.0, axis=0)
    want = data.reindex(lrange(15)).fillna(0)
    assert_frame_equal(got, want)

    got = data.reindex_axis(lrange(5), fill_value=0.0, axis=1)
    want = data.reindex(columns=lrange(5)).fillna(0)
    assert_frame_equal(got, want)

    # other dtypes: mix in a string column before growing the rows
    data["foo"] = "foo"
    got = data.reindex(lrange(15), fill_value=0)
    want = data.reindex(lrange(15)).fillna(0)
    assert_frame_equal(got, want)
def test_reindex_fill_value(self):
    """Reindexing with ``fill_value`` fills new rows/columns with that value."""
    source = DataFrame(np.random.randn(10, 4))

    # axis=0
    no_fill = source.reindex(lrange(15))
    self.assertTrue(np.isnan(no_fill.values[-5:]).all())

    observed = source.reindex(lrange(15), fill_value=0)
    reference = source.reindex(lrange(15)).fillna(0)
    assert_frame_equal(observed, reference)

    # axis=1: one round per fill type (float, int, str)
    for fill in (0., 0, 'foo'):
        observed = source.reindex(columns=lrange(5), fill_value=fill)
        reference = source.copy()
        reference[4] = fill
        assert_frame_equal(observed, reference)

    # reindex_axis: (axis, expected frame) pairs checked in order
    observed = source.reindex_axis(lrange(15), fill_value=0., axis=0)
    reference = source.reindex(lrange(15)).fillna(0)
    assert_frame_equal(observed, reference)

    observed = source.reindex_axis(lrange(5), fill_value=0., axis=1)
    reference = source.reindex(columns=lrange(5)).fillna(0)
    assert_frame_equal(observed, reference)

    # other dtypes
    source['foo'] = 'foo'
    observed = source.reindex(lrange(15), fill_value=0)
    reference = source.reindex(lrange(15)).fillna(0)
    assert_frame_equal(observed, reference)
def test_include_na(self):
    """``get_dummies`` drops NaN by default and adds a NaN column with ``dummy_na``."""
    s = ['a', 'b', np.nan]

    # Default: the NaN entry yields an all-zero row and no extra column.
    res = get_dummies(s, sparse=self.sparse)
    exp = DataFrame({
        'a': {0: 1, 1: 0, 2: 0},
        'b': {0: 0, 1: 1, 2: 0},
    }, dtype=np.uint8)
    assert_frame_equal(res, exp)

    # Sparse dataframes do not allow nan labelled columns, see #GH8822
    res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
    exp_na = DataFrame({
        nan: {0: 0, 1: 0, 2: 1},
        'a': {0: 1, 1: 0, 2: 0},
        'b': {0: 0, 1: 1, 2: 0},
    }, dtype=np.uint8)
    # Put the NaN column last; ``reindex(columns=...)`` replaces the
    # deprecated positional ``reindex_axis(..., 1)`` call.
    exp_na = exp_na.reindex(columns=['a', 'b', nan])
    # hack (NaN handling in assert_index_equal)
    exp_na.columns = res_na.columns
    assert_frame_equal(res_na, exp_na)

    # A lone NaN input: only the values are compared, not the labels.
    res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
    exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                            dtype=np.uint8)
    tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def __init__(self, column, baseline, adjustments=None):
    """Store the baseline data and a normalised adjustments table.

    Parameters
    ----------
    column
        Stored unchanged as ``self.column``.
    baseline : DataFrame
        Its values, index and columns are stored as ``self.baseline``,
        ``self.dates`` and ``self.assets`` respectively.
    adjustments : DataFrame, optional
        Table with the columns named in ``ADJUSTMENT_COLUMNS``. When omitted,
        an empty table with those columns is used. The caller's frame is not
        modified.
    """
    self.column = column
    self.baseline = baseline.values
    self.dates = baseline.index
    self.assets = baseline.columns

    if adjustments is None:
        adjustments = DataFrame(index=DatetimeIndex([]),
                                columns=ADJUSTMENT_COLUMNS)
    else:
        # Ensure that columns are in the correct order, then order the rows.
        # ``reindex``/``sort_values`` replace the removed ``reindex_axis``
        # and ``DataFrame.sort``; both return new frames.
        adjustments = adjustments.reindex(columns=ADJUSTMENT_COLUMNS)
        adjustments = adjustments.sort_values(["apply_date", "sid"])

    self.adjustments = adjustments
    self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
    self.adjustment_end_dates = DatetimeIndex(adjustments.end_date)
    # NOTE(review): ``Int64Index`` is gone in pandas 2.0; switching to
    # ``Index(..., dtype='int64')`` needs a change to the file's imports.
    self.adjustment_sids = Int64Index(adjustments.sid)
def aggregate_indicators(df, index_cols, unstack_col, group_col, metric_col, label):
    """Combine per-group metric tables into one aggregate plus its weights.

    ``df`` is indexed by ``index_cols + [unstack_col]``, the ``metric_col``
    values are unstacked on ``unstack_col`` and grouped by the ``group_col``
    index level. Each group is passed to ``combine_subjects_by_rules``; the
    combined series becomes rows of the aggregate, tagged with ``label`` in
    ``unstack_col``.

    Args:
        df: Input DataFrame; must contain ``'indicator'`` and ``'grade'``
            columns in addition to the columns named by the other arguments.
            It is not modified.
        index_cols (list): Columns that identify a row besides ``unstack_col``.
        unstack_col: Column whose values become columns before grouping.
        group_col: Index level to group by; each key must be convertible
            with ``int``.
        metric_col: Column holding the metric values.
        label: Value written into ``unstack_col`` for every aggregated row.

    Returns:
        tuple: ``(agg, weights)`` where ``agg`` has the same columns as ``df``
        and ``weights`` is the per-group weight table with missing values
        set to 0.0, divided by the column sums.
    """
    # Function-scope import: only ``DataFrame`` is known to be imported at
    # module level, and ``DataFrame.append`` no longer exists.
    from pandas import concat

    df = df.copy()
    aggw = DataFrame(index=df['indicator'].unique(), columns=df['grade'].unique())
    grp = (
        df.set_index(index_cols + [unstack_col])[metric_col]
        .unstack(unstack_col)
        .groupby(level=group_col, group_keys=False)
    )
    pieces = []
    for grade, grp_df in grp:
        weights, grp_df = combine_subjects_by_rules(grp_df, constituent=int(grade))
        grp_df = grp_df.to_frame(metric_col).reset_index()
        grp_df[unstack_col] = label
        pieces.append(grp_df)
        aggw[grade] = weights
    # One concat at the end instead of growing a frame inside the loop.
    agg = concat(pieces) if pieces else DataFrame()
    agg = agg.reindex(columns=df.columns)
    return agg, aggw.fillna(0.0) / aggw.sum()
def __init__(self, column, baseline, adjustments=None):
    """Keep the baseline frame's parts and a column/row-ordered adjustments table.

    Parameters
    ----------
    column
        Stored unchanged as ``self.column``.
    baseline : DataFrame
        ``baseline.values`` is kept as ``self.baseline``, its index as
        ``self.dates`` and its columns as ``self.assets``.
    adjustments : DataFrame, optional
        Must provide the columns listed in ``ADJUSTMENT_COLUMNS``. Defaults
        to an empty table with those columns. The passed frame itself is
        left untouched.
    """
    self.column = column
    self.baseline = baseline.values
    self.dates = baseline.index
    self.assets = baseline.columns

    if adjustments is None:
        adjustments = DataFrame(
            index=DatetimeIndex([]),
            columns=ADJUSTMENT_COLUMNS,
        )
    else:
        # Ensure that columns are in the correct order.
        # (``reindex_axis`` and ``DataFrame.sort`` were removed from pandas;
        # ``reindex(columns=...)`` and ``sort_values`` are the replacements.)
        adjustments = adjustments.reindex(columns=ADJUSTMENT_COLUMNS)
        adjustments = adjustments.sort_values(['apply_date', 'sid'])

    self.adjustments = adjustments
    self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
    self.adjustment_end_dates = DatetimeIndex(adjustments.end_date)
    # NOTE(review): ``Int64Index`` was removed in pandas 2.0; replacing it
    # requires touching the module's import block, which is not visible here.
    self.adjustment_sids = Int64Index(adjustments.sid)
def test_include_na(self):
    """``get_dummies`` ignores NaN unless ``dummy_na=True`` asks for a NaN column."""
    s = ['a', 'b', np.nan]

    # Default: no column for NaN, and the NaN row is all zeros.
    res = get_dummies(s, sparse=self.sparse)
    exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0},
                     'b': {0: 0.0, 1: 1.0, 2: 0.0}})
    assert_frame_equal(res, exp)

    # Sparse dataframes do not allow nan labelled columns, see #GH8822
    res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
    exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0},
                        'a': {0: 1.0, 1: 0.0, 2: 0.0},
                        'b': {0: 0.0, 1: 1.0, 2: 0.0}})
    # Move the NaN column to the end; ``reindex(columns=...)`` replaces the
    # deprecated positional ``reindex_axis(..., 1)`` call.
    exp_na = exp_na.reindex(columns=['a', 'b', nan])
    # hack (NaN handling in assert_index_equal)
    exp_na.columns = res_na.columns
    assert_frame_equal(res_na, exp_na)

    # NaN-only input: compare raw values so the NaN label is not involved.
    res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
    exp_just_na = DataFrame(Series(1.0, index=[0]), columns=[nan])
    tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def get_post_hocs_as_df(df: pd.DataFrame, dm: str, factor: str, tag: str = '') -> pd.DataFrame:
    """Run a pairwise Tukey HSD test and return its result table as a DataFrame.

    Rows of ``df`` containing any missing value are dropped before the test.
    The statsmodels result table is round-tripped through CSV (title line and
    all spaces stripped) and extended with bookkeeping columns.

    Args:
        df: Data containing the ``dm`` and ``factor`` columns.
        dm: Name of the column passed as the first argument of the test;
            written to the ``dv`` output column.
        factor: Name of the grouping column.
        tag: Free-form label stored (as ``str``) in the ``tag`` column.

    Returns:
        DataFrame with columns ``tag``, ``dv``, ``factor``, the columns of the
        Tukey table except ``reject``, then ``p`` and ``sig``.
    """
    complete = df.dropna()
    table = sm.stats.multicomp.pairwise_tukeyhsd(complete[dm], complete[factor])
    pvalues = pvalues_from_tukeyhsd(table)

    table_csv = table._results_table.as_csv()
    # Drop the first line of the CSV dump and the padding spaces.
    table_csv = table_csv[table_csv.find('\n') + 1:].replace(' ', '')
    result = pd.read_csv(StringIO(table_csv), delimiter=',')
    table_columns = result.columns.tolist()

    result['tag'] = str(tag)
    # A single comparison may come back as a bare scalar; ``isinstance`` also
    # covers NumPy float scalars, which are not iterable either.
    if isinstance(pvalues, float):
        result['p'] = [float(pvalues)]
    else:
        result['p'] = [float(p) for p in pvalues]
    result['dv'] = dm
    result['factor'] = factor
    result['sig'] = [p_desc2(p) for p in result['p']]

    result = result.reindex(columns=['tag', 'dv', 'factor'] + table_columns + ['p', 'sig'])
    result.drop(['reject'], axis=1, inplace=True)
    return result
def show_predcition_matrix(prediction: pd.DataFrame) -> None:
    """Plot the prediction matrix as a four-colour annotated grid and show it.

    Rows and columns are sorted by label first. Cell values select a colour
    through the bins 0..3 (white, cornflowerblue, red, darkorange); every
    non-missing, non-negative cell is also annotated with its value. The
    meaning of the codes is spelled out in the figure's super-title.

    Args:
        prediction: Matrix of numeric codes; rows are labelled as dockerins
            and columns as cohesins on the axes.
    """
    prediction = prediction.sort_index()
    prediction = prediction.reindex(columns=sorted(prediction.columns))
    # Convert once instead of on every cell access.
    values = array(prediction)

    plt.figure()
    axis = plt.gca()
    cmap = colors.ListedColormap(['white', 'cornflowerblue', 'red', 'darkorange'])
    bounds = [-0.5, 0.5, 1.5, 2.5, 3.5]
    norm = colors.BoundaryNorm(bounds, cmap.N)
    plt.pcolor(values, cmap=cmap, norm=norm, edgecolors='k', linewidth=2)

    n_rows, n_cols = values.shape
    for y in range(n_rows):
        for x in range(n_cols):
            cell = values[y, x]
            # ``cell == np.nan`` is always False, so test for NaN explicitly.
            if pd.isnull(cell):
                continue
            if cell >= 0:
                plt.text(x+0.5, y+0.5, cell,
                         horizontalalignment='center', verticalalignment='center')

    plt.yticks(arange(0.5, len(prediction.index), 1), prediction.index)
    plt.xticks(arange(0.5, len(prediction.columns), 1), prediction.columns, rotation=70)
    plt.xlabel('Cohesin name', style='oblique')
    plt.ylabel('Dockerin name', style='oblique')
    axis.set_aspect('equal')
    plt.title('Cohesin dockerin cross binding')
    plt.suptitle('0: obs no pred no, 1: obs yes, pred yes\n2: obs no pred yes, 3: obs yes pred no')
    plt.show()
def FindKeywords(self, support=10, ngrams=1):
    """Identify and rank keywords within target and non-target sets.

    Every term that occurs in at least ``support`` documents of
    ``self.search_set`` is scored and assigned to the target list or, when
    its document percentage is strictly higher among non-target documents,
    to the non-target list. Both lists are ranked by the ``ll`` score in
    descending order.

    Args:
        support: Minimum document frequency for a term to be considered.
        ngrams: Maximum n-gram length as an int (expanded to ``(1, ngrams)``),
            or an explicit ``(min, max)`` list/tuple.

    Side effects:
        Sets ``self.target_stats``, ``self.nontarget_stats`` (DataFrames with
        columns ll, n1, n0, p1, p0, T, S, R indexed by term),
        ``self.target_keywords`` and ``self.nontarget_keywords``. When
        ``self.reference_keywords`` exists, also writes the ``T``, ``S`` and
        ``R`` columns of ``self.reference_stats``. Prints the number of
        keywords found in each set.

    Raises:
        ZeroDivisionError: If the target or non-target document list is empty
            while there is at least one frequent term.
    """
    if isinstance(ngrams, int):
        ngrams = (1, ngrams)
    if isinstance(ngrams, list):
        ngrams = tuple(ngrams)

    frequent_words = GetDTM(self.search_set, min_df=support, ngram_range=ngrams).terms
    dtm = GetDTM(self.search_set, min_df=1, vocabulary=frequent_words)
    total_dc = DocCounts(frequent_words, dtm, None)
    target_dc = DocCounts(frequent_words, dtm, self.target_docnames)
    ntarget = len(self.target_docnames)
    nnontarget = len(self.nontarget_docnames)

    alpha1 = 1
    alpha0 = 1
    ranked_by = 'll'
    stat_columns = ['ll', 'n1', 'n0', 'p1', 'p0', 'T', 'S', 'R']
    has_reference = hasattr(self, 'reference_keywords')

    target_wordlist = []
    nontarget_wordlist = []
    target_stats = defaultdict(list)
    nontarget_stats = defaultdict(list)

    for word in frequent_words:
        n1 = target_dc[word]
        n0 = total_dc[word] - target_dc[word]
        p1 = (float(n1)/ntarget)*100
        p0 = (float(n0)/nnontarget)*100
        n1_not = ntarget - n1
        n0_not = nnontarget - n0
        ll = (
            (lgamma(n1+alpha1) + lgamma(n0+alpha0)
             - lgamma(n1+alpha1+n0+alpha0))
            + (lgamma(n1_not+alpha1) + lgamma(n0_not+alpha0)
               - lgamma(n1_not+alpha1+n0_not+alpha0))
        )

        if has_reference:
            r_count = 0
            if word in self.reference_keywords:
                r_count = self.reference_stats.loc[word, 'counts']
        else:
            r_count = None

        # 'T' always records the count among target documents; capture it
        # before the counts are swapped for non-target keywords.
        in_target = n1
        if p0 > p1:
            # More typical of the non-target set: store the stats from the
            # non-target point of view (n1/p1 describe the "own" set).
            p1, p0 = p0, p1
            n1, n0 = n0, n1
            wordlist, stats = nontarget_wordlist, nontarget_stats
        else:
            wordlist, stats = target_wordlist, target_stats

        wordlist.append(word)
        stats['n1'].append(n1)
        stats['n0'].append(n0)
        stats['p1'].append(p1)
        stats['p0'].append(p0)
        stats['ll'].append(ll)
        stats['T'].append(in_target)
        stats['S'].append(n0+n1)
        stats['R'].append(r_count)

    target_stats = DataFrame(target_stats, index=target_wordlist)
    target_stats = target_stats.reindex(columns=stat_columns)
    target_stats.sort_values(ranked_by, ascending=False, inplace=True)

    nontarget_stats = DataFrame(nontarget_stats, index=nontarget_wordlist)
    nontarget_stats = nontarget_stats.reindex(columns=stat_columns)
    nontarget_stats.sort_values(ranked_by, ascending=False, inplace=True)

    if has_reference:
        # Count the reference keywords in the current search set as well.
        ref_words = self.reference_keywords
        ref_dtm = GetDTM(self.search_set, min_df=1, vocabulary=ref_words)
        total_dc = DocCounts(ref_words, ref_dtm, None)
        target_dc = DocCounts(ref_words, ref_dtm, self.target_docnames)
        self.reference_stats['T'] = [target_dc[word] for word in ref_words]
        self.reference_stats['S'] = [total_dc[word] for word in ref_words]
        self.reference_stats['R'] = self.reference_stats['counts']

    self.target_stats = target_stats
    self.nontarget_stats = nontarget_stats
    self.target_keywords = list(target_stats.index)
    self.nontarget_keywords = list(nontarget_stats.index)
    print("{} target set keywords found".format(len(self.target_keywords)))
    print("{} non-target set keywords found".format(len(self.nontarget_keywords)))
def clean_pw_offday(pw_offday, weeklookup, pw_slp2):
    '''
    Clean pw_offday query without filtering out non-off-days
    invoice-level => day level => customer level

    Parameters:
        pw_offday: invoice-level DataFrame with the raw AS400 column names
            listed in the rename mapping below. Renamed IN PLACE.
        weeklookup: DataFrame with a 'Date' column formatted '%m/%d/%Y' and
            the 'Week' and 'ShipWeek' columns used after the merge. Its
            'Date' column is overwritten IN PLACE with date objects.
        pw_slp2: salesperson DataFrame with columns S2NUM#, S2NAME, S2DIVR.
            Renamed IN PLACE.

    Returns:
        (high_level_summary, overall_summary, _agg_bycust, _agg_byday,
        deliveries): summary by Tier, summary by Tier and Warehouse,
        customer-level table, day-level table, cleaned invoice-level table.

    NOTE(review): the nested-dict renaming form of ``groupby().agg`` used
    below was removed in pandas 1.0; it is left untouched here because the
    column relabelling that follows depends on its exact output.
    '''
    print('*'*100)
    print('Cleaning pw_offday query and creating summaries.')
    print('*'*100)
    deliveries = pw_offday

    print('\n\n\nDeclaring functions for later use.')
    def as400_date(dat):
        '''Accepts date as formatted in AS400'''
        dat = str(dat)
        dat = dat[-6:]  # keep the trailing yymmdd digits
        dat = dt.date(dt.strptime(dat, '%y%m%d'))
        return dat

    def sum_digits_in_string(digit):
        # Sum of all digit characters; non-digits are ignored.
        return sum(int(x) for x in digit if x.isdigit())

    print('Mapping Columns.')
    deliveries.rename(columns={'#MIVDT':'Date', '#MDIV#':'Division', '#MIVND':'Invoice',
                               '#MCUS#':'CustomerId', '#MCALL':'Call', '#MPRIO':'Priority',
                               '#MCMP':'Warehouse', 'CASES':'Cases', '#MEXT$':'Dollars',
                               'CSHP':'Ship', '#MSLSP':'SalespersonId', 'CADMBR':'ShipWeekPlan',
                               'CUDSCC':'Merchandising', 'CONPRM':'OnPremise', 'CSTDTE':'CustomerSetup',
                               '#MCUSY':'CustomerType', 'CCUSTN':'Customer'}, inplace=True)
    pw_slp2.rename(columns={'S2NUM#':'SalespersonId', 'S2NAME':'Salesperson', 'S2DIVR':'SalespersonDirector'}, inplace=True)
    deliveries = deliveries.merge(pw_slp2, on='SalespersonId', how='left')

    print('Mapping Customer types.')
    typ_map = {'A':'Bar/Tavern','C':'Country Club','E':'Transportation/Airline','G':'Gambling',
               'J':'Hotel/Motel','L':'Restaurant','M':'Military','N':'Fine Dining','O':'Internal',
               'P':'Country/Western','S':'Package Store','T':'Supermarket/Grocery','V':'Drug Store',
               'Y':'Convenience Store','Z':'Catering','3':'Night Club','5':'Adult Entertainment','6':'Sports Bar',
               'I':'Church','F':'Membership Club','B':'Mass Merchandiser','H':'Fraternal Organization',
               '7':'Sports Venue'}
    deliveries.CustomerType = deliveries.CustomerType.astype(str).map(typ_map)

    print('Mapping Warehouse names.')
    whs_map = {1:'Kansas City',2:'Saint Louis',3:'Columbia',4:'Cape Girardeau', 5:'Springfield'}
    deliveries.Warehouse = deliveries.Warehouse.map(whs_map)

    print('Processing dates.')
    deliveries.Date = [as400_date(d) for d in deliveries.Date.astype(str).tolist()]
    weeklookup['Date'] = [dt.date(dt.strptime(w_Dat, '%m/%d/%Y')) for w_Dat in weeklookup['Date'].astype(str).tolist()]

    print('Merging on dates with week lookup.')
    deliveries = deliveries.merge(weeklookup, on='Date')

    dat = Series(deliveries.Date.tolist())
    deliveries['Weekday'] = Series([dt.strftime(d, '%A') for d in dat])

    week_plan = deliveries.ShipWeekPlan.tolist()
    week_shipped = deliveries.ShipWeek.tolist()

    print('Using custom logic to derive which days were off-day deliveries.')
    # Ship is normalised to a 7-character zero-padded string; each position
    # is then read as a 0/1 flag for Monday..Sunday.
    deliveries.Ship = del_days = [str('%07d'% int(str(day).zfill(0))) for day in deliveries.Ship.astype(str).tolist()]

    mon = Series([d[-7:][:1] for d in del_days]).map({'1':'M','0':'_'})
    tue = Series([d[-6:][:1] for d in del_days]).map({'1':'T','0':'_'})
    wed = Series([d[-5:][:1] for d in del_days]).map({'1':'W','0':'_'})
    thu = Series([d[-4:][:1] for d in del_days]).map({'1':'R','0':'_'})
    fri = Series([d[-3:][:1] for d in del_days]).map({'1':'F','0':'_'})
    sat = Series([d[-2:][:1] for d in del_days]).map({'1':'S','0':'_'})
    sun = Series([d[-1:][:1] for d in del_days]).map({'1':'U','0':'_'})

    # Concatenate the seven letters per row, e.g. 'M_W_F__'.
    deliveries['DeliveryDays'] = del_days = list(itertools.chain.from_iterable([mon + tue + wed + thu + fri + sat + sun]))

    weekday = deliveries.Weekday = [d[:3] for d in deliveries.Weekday.astype(str).tolist()]
    _days = DataFrame(data={'Weekday':weekday, 'WeekPlanned':week_plan, 'WeekShipped':week_shipped, 'DelDays':del_days}) #'Monday':mon, 'Tuesday':tue, 'Wednesday':wed, 'Thursday':thu, 'Friday':fri, 'Saturday':sat, 'Sunday':sun,
    day_list = _days['WeekPlanned'].tolist()
    # Only 'A' and 'B' count as a week plan; anything else falls back to the
    # week the delivery actually shipped in, so it can never be "off week".
    _days['WeekPlanned'] = [d if d in ['A','B'] else '' for d in day_list]

    _week_actual = _days.WeekShipped.tolist()
    _week_plan = _days['WeekPlanned'] = [ship_week if plan_week == '' else plan_week for ship_week, plan_week in zip(_week_actual,_days.WeekPlanned.tolist())]
    _days['OffWeek'] = _off_week = [p != a for p, a in zip(_week_plan, _week_actual)]

    # 'T' when the delivery happened on that weekday although the weekday's
    # letter is absent from the customer's delivery days, otherwise 'F'.
    off_mon = [str('M' not in d and w == 'Mon')[:1] for d, w in zip(del_days, weekday)]
    off_tue = [str('T' not in d and w == 'Tue')[:1] for d, w in zip(del_days, weekday)]
    off_wed = [str('W' not in d and w == 'Wed')[:1] for d, w in zip(del_days, weekday)]
    off_thu = [str('R' not in d and w == 'Thu')[:1] for d, w in zip(del_days, weekday)]
    off_fri = [str('F' not in d and w == 'Fri')[:1] for d, w in zip(del_days, weekday)]
    off_sat = [str('S' not in d and w == 'Sat')[:1] for d, w in zip(del_days, weekday)]
    off_sun = [str('U' not in d and w == 'Sun')[:1] for d, w in zip(del_days, weekday)]

    _off_days = DataFrame({'Mon':off_mon, 'Tue':off_tue, 'Wed':off_wed, 'Thu':off_thu,
                           'Fri':off_fri, 'Sat':off_sat, 'Sun':off_sun, 'OffWeek':_off_week, 'Weekday':weekday})
    _off_days = _off_days[['Mon','Tue','Wed','Thu','Fri','Sat','Sun','Weekday','OffWeek']]
    _off_days['OffDayDelivery'] = (_off_days['Mon'] == 'T') | (_off_days['Tue'] == 'T') | (_off_days['Wed'] == 'T') | (_off_days['Thu'] == 'T') | (_off_days['Fri'] == 'T') | (_off_days['Sat'] == 'T') | (_off_days['Sun'] == 'T') | (_off_days['OffWeek'] == True)

    print('Check here if you suspect a bug.')
    #check_later = _off_days[_off_days['OffDayDelivery'] == True]

    print('Mapping Call Codes.')
    deliveries = pd.concat([deliveries,_off_days[['OffWeek','OffDayDelivery']]], axis=1)
    deliveries.Call = deliveries.Call.map({1:'Customer Call', 2:'ROE/EDI', 3:'Salesperson Call', 4:'Telesales'})

    print('Putting Setup Date into proper date format.')
    setup_date = deliveries.CustomerSetup.astype(str).tolist()
    setup_month = Series([d.zfill(4)[:2] for d in setup_date])
    # Two-digit years below 20 are read as 20xx, all others as 19xx.
    setup_year = Series(["20" + s[-2:] if int(s[-2:]) < 20 else "19" + s[-2:] for s in setup_date])
    #this_century = [int(d[-2:]) < 20 for d in setup_date]
    deliveries['CustomerSetup'] = c_setup = [str(mon) + '-' + str(yr) for mon, yr in zip(setup_month, setup_year)]

    print('Defining new customers based on whether they were setup last month or not.')
    if dt.now().month == 1:
        last_month = '12'
    else:
        last_month = str(dt.now().month - 1).zfill(2)
    if dt.now().month == 1:
        this_year = str(dt.now().year - 1)
    else:
        this_year = str(dt.now().year)
    m_y_cutoff = last_month + '-' + this_year

    deliveries['NewCustomer'] = [1 if m_y_cutoff == setup else 0 for setup in c_setup]
    deliveries['OffDayDeliveries'] = deliveries.OffDayDelivery.astype(int)

    print('Deriving number of weekly deliveries allotted to each customer.')
    _n_days = deliveries.Ship.astype(str).tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [sum_digits_in_string(n) for n in _n_days]
    _allot = deliveries['AllottedWeeklyDeliveryDays'].tolist()
    _week_ind = deliveries['ShipWeekPlan'].tolist()
    # Customers on an 'A' or 'B' week plan are counted as 0.5 days per week.
    deliveries['AllottedWeeklyDeliveryDays'] = [a if w not in ['A','B'] else 0.5 for a, w in zip(_allot, _week_ind)]
    _n_days = deliveries.set_index('CustomerId')['AllottedWeeklyDeliveryDays'].to_dict()

    print('\n')
    print('-'*100)
    print('\n')

    print('Aggregating by Day.')
    agg_funcs_day = {'OffDayDeliveries' : {'Count':max},
                     'Date' : {'Count':len_unique},
                     'Cases' : {'Sum':sum, 'Avg':np.mean},
                     'Dollars' : {'Sum':sum, 'Avg':np.mean},
                     'NewCustomer': lambda x: min(x)}

    pass_through_cols = ['CustomerId','Customer','Week','Date']
    _agg_byday = DataFrame(deliveries.groupby(pass_through_cols).agg(agg_funcs_day)).reset_index(drop=False)
    _agg_byday = DataFrame(_agg_byday[['CustomerId','Customer','Week','Date','OffDayDeliveries','NewCustomer','Cases','Dollars']])
    # Flatten the two-level column labels, then overwrite them positionally.
    _agg_byday.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byday.columns]
    _agg_byday.columns = ['CustomerId','Customer','Week','Date','Delivery','OffDayDelivery','NewCustomer','Cases|Sum','Cases|Avg','Dollars|Sum','Dollars|Avg']
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['CustomerId'].astype(int)
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['AllottedWeeklyDeliveryDays|Count'].map(_n_days)

    print('Aggregating by Week.')
    agg_funcs_week = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x)}

    _agg_byweek = DataFrame(_agg_byday.groupby(['CustomerId','Week']).agg(agg_funcs_week)).reset_index(drop=False)
    _agg_byweek.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byweek.columns]

    print('Mapping number of deliveries to Customers.')
    # Map number of total deliveries each week by customer
    # to determine whether a customer with TWR deliveries
    # got TWF deliveries -- which is an off-day delivery
    # but not an additional delivery. Use a dictionary {(cust#, week) : n_deliveries_total}
    _c = _agg_byweek['CustomerId'].astype(str).tolist()
    _w = _agg_byweek['Week'].astype(str).tolist()
    _agg_byweek['_X'] = [c + ',' + w for c,w in zip(_c,_w)]
    by_week_map = _agg_byweek.set_index('_X')['Delivery|Count'].to_dict()

    cid = _agg_byday['CustomerId'].astype(str).tolist()
    wkk = _agg_byday['Week'].astype(str).tolist()
    _agg_byday['N_DeliveriesThisWeek'] = [c + ',' + w for c, w in zip(cid, wkk)]
    _agg_byday['N_DeliveriesThisWeek'] = _agg_byday['N_DeliveriesThisWeek'].map(Series(by_week_map))

    print('Using custom logic to define Additional Delivery Days.')
    # Criteria 1 and 2 compare each row with the row directly above it, so
    # they rely on the row order produced by the day-level groupby.
    addl_day_criteria_1 = ( _agg_byday.shift(1)['CustomerId'] == _agg_byday['CustomerId'] )
    addl_day_criteria_2 = ( _agg_byday.shift(1)['Week'] == _agg_byday['Week'] )
    addl_day_criteria_3 = ( _agg_byday['OffDayDelivery'] == 1 )
    addl_day_criteria_4 = ( _agg_byday['NewCustomer'] != 1 )
    addl_day_criteria_5 = ( _agg_byday['N_DeliveriesThisWeek'] > _agg_byday['AllottedWeeklyDeliveryDays|Count'] )

    _agg_byday['AdditionalDeliveryDays'] = Series(addl_day_criteria_1 & addl_day_criteria_2 & addl_day_criteria_3 & addl_day_criteria_4 & addl_day_criteria_5).astype(int)

    print('Aggregating by Customer.')
    agg_funcs_cust = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x),
                      'AllottedWeeklyDeliveryDays|Count': lambda x: max(x),
                      'AdditionalDeliveryDays': lambda x: sum(x),
                      'Dollars|Sum':lambda x: int(sum(x)),
                      'Cases|Sum':lambda x: sum(x) }

    _agg_bycust = DataFrame(_agg_byday.groupby(['CustomerId','Customer']).agg(agg_funcs_cust)).reset_index(drop=False)
    _agg_bycust.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_bycust.columns]
    # Sort the columns alphabetically so the positional relabelling below is
    # deterministic. ``reindex(columns=...)`` replaces the removed
    # ``reindex_axis(..., axis=1)``.
    _agg_bycust = _agg_bycust.reindex(columns=sorted(_agg_bycust.columns))

    _agg_bycust.columns = ['AdditionalDeliveries','AllottedDeliveryDays','Cases',
                           'Customer','CustomerId','Deliveries','Dollars',
                           'NewCustomer','OffDayDeliveries']
    _agg_bycust = _agg_bycust[['CustomerId','Customer','NewCustomer','AllottedDeliveryDays','Deliveries',
                               'OffDayDeliveries','AdditionalDeliveries','Cases','Dollars']]

    print('Mapping useful Customer attributes.')
    attr = ['CustomerId','Warehouse','OnPremise','CustomerSetup','CustomerType','ShipWeekPlan','DeliveryDays']
    customer_attributes = deliveries[attr].drop_duplicates().reset_index(drop=True)

    _agg_bycust = _agg_bycust.merge(customer_attributes, on='CustomerId', how='inner').drop_duplicates()
    _agg_bycust = _agg_bycust.sort_values(by=['AdditionalDeliveries','OffDayDeliveries'], ascending=False).reset_index(drop=True)

    _agg_bycust['CasesPerDelivery'] = _agg_bycust['Cases'] / _agg_bycust['Deliveries']
    _agg_bycust['DollarsPerDelivery'] = round(_agg_bycust['Dollars'] / _agg_bycust['Deliveries'],2)

    _agg_bycust['OffDayDeliveries/Deliveries'] = round(_agg_bycust['OffDayDeliveries'] / _agg_bycust['Deliveries'],2)
    _agg_bycust['AdditionalDeliveries/Deliveries'] = round(_agg_bycust['AdditionalDeliveries'] / _agg_bycust['Deliveries'],2)

    print('Mapping Tiers based on allotted delivery days.')
    tier_map = {0:'No Delivery Days Assigned',0.5:'Tier 4', 1:'Tier 3', 2:'Tier 2', 3:'Tier 1', 4:'Tier 1', 5:'Tier 1', 6:'Tier 1', 7:'Tier 1'}
    _agg_bycust['Tier'] = _agg_bycust['AllottedDeliveryDays'].map(tier_map)

    addl_deliv = _agg_bycust['AdditionalDeliveries'].tolist()
    tier = _agg_bycust['Tier'].tolist()

    # Customers without assigned delivery days cannot have additional ones.
    _agg_bycust['AdditionalDeliveries'] = [addl if t != 'No Delivery Days Assigned' else 0 for addl, t in zip(addl_deliv, tier)]

    _agg_bycust['ShipWeekPlan'] = _agg_bycust['ShipWeekPlan'].replace(np.nan, '')

    print('Creating Overall Summary.')
    agg_funcs_summary = {'Deliveries':sum,
                         'OffDayDeliveries':sum,
                         'AdditionalDeliveries':sum,
                         'Dollars':{'Avg':np.mean},
                         'Cases':{'Avg':np.mean},
                         'CasesPerDelivery':{'Avg':np.mean},
                         'NewCustomer':sum,
                         'Customer':len,
                         'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}

    overall_summary = DataFrame(_agg_bycust.groupby(['Tier','Warehouse']).agg(agg_funcs_summary))
    overall_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in overall_summary.columns]
    overall_summary = overall_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                       'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                       'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    overall_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                               'Cases|mean','CasesPerDelivery|mean','Dollars|mean']

    print('Creating High-Level Summary.\n\n\n')
    agg_funcs_HL_summary = {'Deliveries':sum,
                            'OffDayDeliveries':sum,
                            'AdditionalDeliveries':sum,
                            'Dollars':{'Avg':np.mean},
                            'Cases':{'Avg':np.mean},
                            'CasesPerDelivery':{'Avg':np.mean},
                            'NewCustomer':sum,
                            'Customer':len,
                            'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}

    high_level_summary = DataFrame(_agg_bycust.groupby(['Tier']).agg(agg_funcs_HL_summary))
    high_level_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in high_level_summary.columns]
    high_level_summary = high_level_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                             'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                             'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    high_level_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                                  'Cases|mean','CasesPerDelivery|mean','Dollars|mean']

    print('*'*100)
    print('Finished creating summaries at high level, overall, and aggregating by customer and by day.')
    print('*'*100)

    return high_level_summary, overall_summary, _agg_bycust, _agg_byday, deliveries
# Sort by 'one' descending, then 'two' ascending. ``DataFrame.sort`` was
# removed from pandas; ``sort_values`` is its replacement.
df.sort_values(by=['one', 'two'], ascending=[False, True])

# Build a long ticker/price table with two copies of the same three dates.
prices = [101.0, 102.0, 103.0]
tickers = ['GOOG', 'AAPL']
data = [v for v in itertools.product(tickers, prices)]
dates = pandas.date_range('2013-01-03', periods=3)
df = DataFrame(data, columns=['ticker', 'price'])
df['dates'] = dates.append(dates)
df

# Long -> wide: one row per date, one column per ticker.
df.pivot(index='dates', columns='ticker', values='price')

# Reindexing rows; labels that do not exist in ``original`` come back as NaN.
original = DataFrame([[1, 1], [2, 2], [3.0, 3]], index=['a', 'b', 'c'], columns=['one', 'two'])
original.reindex(index=['b', 'c', 'd'])

# Take index and columns from another frame.
different = DataFrame([[1, 1], [2, 2], [3.0, 3]], index=['c', 'd', 'e'], columns=['one', 'two'])
original.reindex_like(different)

# Reorder columns; ``reindex(columns=...)`` replaces the removed
# ``reindex_axis(..., axis=1)``.
original.reindex(columns=['two', 'one'])

# Joins on the shared 'one' column.
left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=['one', 'two'])
right = DataFrame([[1, 2], [3, 4], [7, 8]], columns=['one', 'three'])
left.merge(right, on='one')  # Same as how='inner'
left.merge(right, on='one', how='left')
left.merge(right, on='one', how='right')
left.merge(right, on='one', how='outer')

# In-place update: non-NaN cells of ``right`` overwrite the cells of
# ``left`` that carry the same row and column labels.
left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=['one', 'two'])
left
right = DataFrame([[nan, 12], [13, nan], [nan, 8]], columns=['one', 'two'], index=[1, 2, 3])
right
left.update(right)  # Updates values in left
left
# Build a long ticker/price table with two copies of the same three dates.
data = [v for v in itertools.product(tickers, prices)]
dates = pandas.date_range('2013-01-03', periods=3)
df = DataFrame(data, columns=['ticker', 'price'])
df['dates'] = dates.append(dates)
df

# Long -> wide: one row per date, one column per ticker.
df.pivot(index='dates', columns='ticker', values='price')

# Reindexing rows; labels missing from ``original`` come back as NaN.
original = DataFrame([[1, 1], [2, 2], [3.0, 3]],
                     index=['a', 'b', 'c'],
                     columns=['one', 'two'])
original.reindex(index=['b', 'c', 'd'])

# Take index and columns from another frame.
different = DataFrame([[1, 1], [2, 2], [3.0, 3]],
                      index=['c', 'd', 'e'],
                      columns=['one', 'two'])
original.reindex_like(different)

# Reorder columns; ``reindex(columns=...)`` replaces the removed
# ``reindex_axis(..., axis=1)``.
original.reindex(columns=['two', 'one'])

# Joins on the shared 'one' column.
left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=['one', 'two'])
right = DataFrame([[1, 2], [3, 4], [7, 8]], columns=['one', 'three'])
left.merge(right, on='one')  # Same as how='inner'
left.merge(right, on='one', how='left')
left.merge(right, on='one', how='right')
left.merge(right, on='one', how='outer')

# In-place update: non-NaN cells of ``right`` overwrite the cells of
# ``left`` that carry the same row and column labels.
left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=['one', 'two'])
left
right = DataFrame([[nan, 12], [13, nan], [nan, 8]],
                  columns=['one', 'two'],
                  index=[1, 2, 3])
right
left.update(right)  # Updates values in left
def clean_pw_offday(pw_offday, weeklookup):
    '''
    Clean pw_offday query without filtering out non-off-days
    invoice-level => day level => customer level

    Args:
        pw_offday: DataFrame with exactly 17 columns, in the positional
            order assigned to ``deliveries.columns`` below.  It is modified
            IN PLACE (columns renamed; CustomerType, Warehouse and Date
            re-mapped).
        weeklookup: DataFrame with a 'Date' column of '%m/%d/%Y' strings and
            'Week' and 'ShipWeek' columns.  Its 'Date' column is converted
            to ``datetime.date`` IN PLACE.

    Returns:
        tuple: ``(high_level_summary, overall_summary, _agg_bycust,
        _agg_byday)`` -- summary per Tier, summary per (Tier, Warehouse),
        one row per customer, and one row per customer per delivery day.

    Notes:
        * Two-digit setup years are resolved against the current year
          (yy <= current yy => this century, otherwise the previous one).
          The former fixed ``< 20`` pivot turned every setup year from 2020
          onwards into 19xx, so no customer could ever be flagged as new.
        * Aggregations are done column by column instead of with nested
          renaming dicts, which pandas >= 1.0 rejects.
    '''
    print('*' * 100)
    print('Cleaning pw_offday query and creating summaries.')
    print('*' * 100)
    deliveries = pw_offday

    print('\n\n\nDeclaring functions for later use.')

    def as400_date(dat):
        '''Accepts date as formatted in AS400 (last six characters: yymmdd)'''
        return dt.strptime(str(dat)[-6:], '%y%m%d').date()

    def sum_digits_in_string(digit):
        '''Add up every decimal digit character found in the string.'''
        return sum(int(x) for x in digit if x.isdigit())

    print('Mapping Columns.')
    deliveries.columns = [
        'Date', 'Division', 'Invoice', 'CustomerId', 'Call', 'Priority',
        'Warehouse', 'Cases', 'Dollars', 'Ship', 'Salesperson',
        'ShipWeekPlan', 'Merchandising', 'OnPremise', 'CustomerSetup',
        'CustomerType', 'Customer',
    ]

    print('Mapping Customer types.')
    typ_map = {
        'A': 'Bar/Tavern', 'C': 'Country Club',
        'E': 'Transportation/Airline', 'G': 'Gambling',
        'J': 'Hotel/Motel', 'L': 'Restaurant', 'M': 'Military',
        'N': 'Fine Dining', 'O': 'Internal', 'P': 'Country/Western',
        'S': 'Package Store', 'T': 'Supermarket/Grocery',
        'V': 'Drug Store', 'Y': 'Convenience Store', 'Z': 'Catering',
        '3': 'Night Club', '5': 'Adult Entertainment', '6': 'Sports Bar',
        'I': 'Church', 'F': 'Membership Club', 'B': 'Mass Merchandiser',
        'H': 'Fraternal Organization', '7': 'Sports Venue',
    }
    deliveries['CustomerType'] = (
        deliveries['CustomerType'].astype(str).map(typ_map))

    print('Mapping Warehouse names.')
    whs_map = {
        1: 'Kansas City',
        2: 'Saint Louis',
        3: 'Columbia',
        4: 'Cape Girardeau',
        5: 'Springfield',
    }
    deliveries['Warehouse'] = deliveries['Warehouse'].map(whs_map)

    print('Processing dates.')
    deliveries['Date'] = [
        as400_date(d) for d in deliveries['Date'].astype(str).tolist()
    ]
    weeklookup['Date'] = [
        dt.strptime(w_dat, '%m/%d/%Y').date()
        for w_dat in weeklookup['Date'].astype(str).tolist()
    ]

    print('Merging on dates with week lookup.')
    deliveries = deliveries.merge(weeklookup, on='Date')

    # First three letters of the weekday name of every delivery date.
    weekday = [d.strftime('%A')[:3] for d in deliveries['Date'].tolist()]
    deliveries['Weekday'] = weekday
    week_plan = deliveries['ShipWeekPlan'].tolist()
    week_shipped = deliveries['ShipWeek'].tolist()

    print('Using custom logic to derive which days were off-day deliveries.')
    # Ship is a 7-digit 0/1 mask, Monday first; left-pad to seven digits.
    ship_codes = [
        '%07d' % int(day) for day in deliveries['Ship'].astype(str).tolist()
    ]
    deliveries['Ship'] = ship_codes
    day_letters = 'MTWRFSU'
    # One Series per weekday: its letter when the flag is '1', '_' when '0'.
    # Any other character maps to NaN, exactly as before.
    pieces = [
        Series([code[-7:][pos] for code in ship_codes]).map(
            {'1': letter, '0': '_'})
        for pos, letter in enumerate(day_letters)
    ]
    delivery_days = pieces[0]
    for piece in pieces[1:]:
        delivery_days = delivery_days + piece
    del_days = delivery_days.tolist()
    deliveries['DeliveryDays'] = del_days

    # Only 'A'/'B' count as a planned week; for anything else the planned
    # week is taken to be the week that actually shipped.
    resolved_plan = [
        plan if plan in ['A', 'B'] else shipped
        for plan, shipped in zip(week_plan, week_shipped)
    ]
    off_week = [p != a for p, a in zip(resolved_plan, week_shipped)]

    # A delivery is off-day when the letter of the weekday it shipped on is
    # absent from the customer's delivery-day mask.
    letter_for = dict(
        zip(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], day_letters))
    off_day = [
        w in letter_for and letter_for[w] not in d
        for d, w in zip(del_days, weekday)
    ]

    print('Check here if you suspect a bug.')

    print('Mapping Call Codes.')
    deliveries['OffWeek'] = off_week
    deliveries['OffDayDelivery'] = [
        bool(day or week) for day, week in zip(off_day, off_week)
    ]
    deliveries['Call'] = deliveries['Call'].map({
        1: 'Customer Call',
        2: 'ROE/EDI',
        3: 'Salesperson Call',
        4: 'Telesales',
    })

    print('Putting Setup Date into proper date format.')
    now = dt.now()  # read once so every date decision below agrees
    current_yy = now.year % 100
    this_century = str(now.year // 100)
    last_century = str(now.year // 100 - 1)
    setup_date = deliveries['CustomerSetup'].astype(str).tolist()
    # CustomerSetup is MMYY; rebuild it as 'MM-YYYY'.
    c_setup = [
        s.zfill(4)[:2] + '-'
        + (this_century if int(s[-2:]) <= current_yy else last_century)
        + s[-2:]
        for s in setup_date
    ]
    deliveries['CustomerSetup'] = c_setup

    print(
        'Defining new customers based on whether they were setup last month or not.'
    )
    if now.month == 1:
        m_y_cutoff = '12' + '-' + str(now.year - 1)
    else:
        m_y_cutoff = str(now.month - 1).zfill(2) + '-' + str(now.year)
    deliveries['NewCustomer'] = [
        1 if m_y_cutoff == setup else 0 for setup in c_setup
    ]
    deliveries['OffDayDeliveries'] = deliveries['OffDayDelivery'].astype(int)

    print('Deriving number of weekly deliveries allotted to each customer.')
    # Customers on an 'A'/'B' plan count as half a delivery day per week.
    deliveries['AllottedWeeklyDeliveryDays'] = [
        0.5 if plan in ['A', 'B'] else sum_digits_in_string(code)
        for code, plan in zip(ship_codes, week_plan)
    ]
    _n_days = deliveries.set_index(
        'CustomerId')['AllottedWeeklyDeliveryDays'].to_dict()

    print('\n')
    print('-' * 100)
    print('\n')

    print('Aggregating by Day.')
    pass_through_cols = ['CustomerId', 'Customer', 'Week', 'Date']
    by_day = deliveries.groupby(pass_through_cols)
    _agg_byday = pd.concat([
        by_day['OffDayDeliveries'].max().rename('OffDayDelivery'),
        by_day['NewCustomer'].min(),
        by_day['Cases'].sum().rename('Cases|Sum'),
        by_day['Cases'].mean().rename('Cases|Avg'),
        by_day['Dollars'].sum().rename('Dollars|Sum'),
        by_day['Dollars'].mean().rename('Dollars|Avg'),
    ], axis=1).reset_index(drop=False)
    # Date is one of the grouping keys, so each row is exactly one delivery
    # day (this used to be computed as the unique-date count per group).
    _agg_byday.insert(4, 'Delivery', 1)
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday[
        'CustomerId'].astype(int).map(_n_days)

    print('Aggregating by Week.')
    print('Mapping number of deliveries to Customers.')
    # Map number of total deliveries each week by customer
    # to determine whether a customer with TWR deliveries
    # got TWF deliveries -- which is an off-day delivery
    # but not an additional delivery.
    _agg_byday['N_DeliveriesThisWeek'] = _agg_byday.groupby(
        ['CustomerId', 'Week'])['Delivery'].transform('sum')

    print('Using custom logic to define Additional Delivery Days.')
    same_customer = (
        _agg_byday['CustomerId'].shift(1) == _agg_byday['CustomerId'])
    same_week = _agg_byday['Week'].shift(1) == _agg_byday['Week']
    is_off_day = _agg_byday['OffDayDelivery'] == 1
    not_new = _agg_byday['NewCustomer'] != 1
    over_allotment = (_agg_byday['N_DeliveriesThisWeek'] >
                      _agg_byday['AllottedWeeklyDeliveryDays|Count'])
    _agg_byday['AdditionalDeliveryDays'] = Series(
        same_customer & same_week & is_off_day & not_new
        & over_allotment).astype(int)

    print('Aggregating by Customer.')
    by_cust = _agg_byday.groupby(['CustomerId', 'Customer'])
    _agg_bycust = pd.concat([
        by_cust['NewCustomer'].min(),
        by_cust['AllottedWeeklyDeliveryDays|Count'].max().rename(
            'AllottedDeliveryDays'),
        by_cust['Delivery'].sum().rename('Deliveries'),
        by_cust['OffDayDelivery'].sum().rename('OffDayDeliveries'),
        by_cust['AdditionalDeliveryDays'].sum().rename(
            'AdditionalDeliveries'),
        by_cust['Cases|Sum'].sum().rename('Cases'),
        # Dollars are truncated to whole numbers, as before.
        by_cust['Dollars|Sum'].sum().astype(int).rename('Dollars'),
    ], axis=1).reset_index(drop=False)

    print('Mapping useful Customer attributes.')
    attr = [
        'CustomerId', 'Warehouse', 'OnPremise', 'CustomerSetup',
        'CustomerType', 'ShipWeekPlan', 'DeliveryDays'
    ]
    customer_attributes = deliveries[attr].drop_duplicates().reset_index(
        drop=True)
    _agg_bycust = _agg_bycust.merge(
        customer_attributes, on='CustomerId', how='inner').drop_duplicates()
    _agg_bycust = _agg_bycust.sort_values(
        by=['AdditionalDeliveries', 'OffDayDeliveries'],
        ascending=False).reset_index(drop=True)
    _agg_bycust['CasesPerDelivery'] = (
        _agg_bycust['Cases'] / _agg_bycust['Deliveries'])
    _agg_bycust['DollarsPerDelivery'] = round(
        _agg_bycust['Dollars'] / _agg_bycust['Deliveries'], 2)
    _agg_bycust['OffDayDeliveries/Deliveries'] = round(
        _agg_bycust['OffDayDeliveries'] / _agg_bycust['Deliveries'], 2)
    _agg_bycust['AdditionalDeliveries/Deliveries'] = round(
        _agg_bycust['AdditionalDeliveries'] / _agg_bycust['Deliveries'], 2)

    print('Mapping Tiers based on allotted delivery days.')
    tier_map = {
        0: 'No Delivery Days Assigned',
        0.5: 'Tier 4',
        1: 'Tier 3',
        2: 'Tier 2',
        3: 'Tier 1',
        4: 'Tier 1',
        5: 'Tier 1',
        6: 'Tier 1',
        7: 'Tier 1'
    }
    _agg_bycust['Tier'] = _agg_bycust['AllottedDeliveryDays'].map(tier_map)
    # Customers with no assigned delivery days cannot have "additional" ones.
    _agg_bycust['AdditionalDeliveries'] = [
        addl if t != 'No Delivery Days Assigned' else 0
        for addl, t in zip(_agg_bycust['AdditionalDeliveries'].tolist(),
                           _agg_bycust['Tier'].tolist())
    ]
    _agg_bycust['ShipWeekPlan'] = _agg_bycust['ShipWeekPlan'].replace(
        np.nan, '')

    print('Creating Overall Summary.')
    overall_summary = _summarize_offday_customers(
        _agg_bycust, ['Tier', 'Warehouse'])

    print('Creating High-Level Summary.\n\n\n')
    high_level_summary = _summarize_offday_customers(_agg_bycust, ['Tier'])

    print('*' * 100)
    print(
        'Finished creating summaries at high level, overall, and aggregating by customer and by day.'
    )
    print('*' * 100)

    return high_level_summary, overall_summary, _agg_bycust, _agg_byday


def _summarize_offday_customers(customers, keys):
    '''Roll the per-customer off-day table up to one row per `keys` group.'''
    grouped = customers.groupby(keys)
    return pd.concat([
        grouped['NewCustomer'].sum().rename('NewCustomers'),
        grouped.size().rename('Customers'),
        # Built-in round() is kept so the rounding rule is unchanged.
        grouped['AllottedDeliveryDays'].agg(
            lambda x: round(np.mean(x), 1)).rename('AvgAllottedDeliveryDays'),
        grouped['Deliveries'].sum(),
        grouped['OffDayDeliveries'].sum(),
        grouped['AdditionalDeliveries'].sum(),
        grouped['Cases'].mean().rename('Cases|mean'),
        grouped['CasesPerDelivery'].mean().rename('CasesPerDelivery|mean'),
        grouped['Dollars'].mean().rename('Dollars|mean'),
    ], axis=1)
# Notebook-style walkthrough of sorting, pivoting, re-indexing, merging and
# updating DataFrames.  Bare expressions (e.g. ``df``) are kept: in an
# interactive session they display the object.

# Sort by "one" descending, then by "two" ascending.  DataFrame.sort() was
# removed from pandas; sort_values() is the supported spelling.
df.sort_values(by=["one", "two"], ascending=[False, True])

# Long-format table: one row per (ticker, price) pair, three dates per ticker.
prices = [101.0, 102.0, 103.0]
tickers = ["GOOG", "AAPL"]
data = list(itertools.product(tickers, prices))
dates = pandas.date_range("2013-01-03", periods=3)
df = DataFrame(data, columns=["ticker", "price"])
df["dates"] = dates.append(dates)  # the same three dates, once per ticker
df
# Long -> wide: one row per date, one column per ticker.
df.pivot(index="dates", columns="ticker", values="price")

# Re-indexing: labels missing from the original produce NaN rows.
original = DataFrame(
    [[1, 1], [2, 2], [3.0, 3]], index=["a", "b", "c"], columns=["one", "two"]
)
original.reindex(index=["b", "c", "d"])
different = DataFrame(
    [[1, 1], [2, 2], [3.0, 3]], index=["c", "d", "e"], columns=["one", "two"]
)
original.reindex_like(different)
# reindex_axis() was removed from pandas; reindex(columns=...) is equivalent.
original.reindex(columns=["two", "one"])

# Merging on a shared key column.
left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["one", "two"])
right = DataFrame([[1, 2], [3, 4], [7, 8]], columns=["one", "three"])
left.merge(right, on="one")  # Same as how='inner'
left.merge(right, on="one", how="left")
left.merge(right, on="one", how="right")
left.merge(right, on="one", how="outer")

# In-place update from another frame.
left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["one", "two"])
left
right = DataFrame(
    [[nan, 12], [13, nan], [nan, 8]], columns=["one", "two"], index=[1, 2, 3]
)
right
left.update(right)  # Updates values in left
left
def aggregate_unsaleables_by_product(pwunsale_tidy, pwrct1_tidy, pw_ytdprod,
                                     pw_ytdsupp):
    '''
    Aggregates unsaleables returns & dumps by product.
    Takes tidy data as input (previous function).

    Args:
        pwunsale_tidy: DataFrame of returns (MTC); needs SupplierId,
            Supplier, ProductId, Product, CasesReturned and ExtCost.
        pwrct1_tidy: DataFrame of unsaleables (RCT1); needs SupplierId,
            Supplier, ProductId, Product, CasesUnsaleable, ExtCost, Size,
            Class and QPC.
        pw_ytdprod: DataFrame merged on ProductId; must supply
            'DollarSales|byproduct'.
        pw_ytdsupp: DataFrame merged on SupplierId; must supply
            'DollarSales|bysupplier'.

    Returns:
        DataFrame with one row per product, sorted by
        'DollarsUnsaleable|sum' descending, NaN replaced by 0.

    Notes:
        Reads the module-level ``directors`` frame (SupplierId, Director).
        Aggregation uses list-of-functions ``agg`` because nested renaming
        dicts are rejected by pandas >= 1.0; ``np.int`` (removed from NumPy)
        is replaced by the built-in ``int``.
    '''
    pwunsale = pwunsale_tidy
    pwrct1 = pwrct1_tidy
    grp_cols = ['SupplierId', 'Supplier', 'ProductId', 'Product']

    def _avg_and_sum_by_product(frame, kind):
        '''Per-product mean and sum of Cases<kind> and ExtCost, flat names.'''
        agg = frame.groupby(grp_cols)[['Cases' + kind,
                                       'ExtCost']].agg(['mean', 'sum'])
        # Column order follows the selection and function lists above.
        agg.columns = [
            'Cases%s|avg' % kind, 'Cases%s|sum' % kind,
            'Dollars%s|avg' % kind, 'Dollars%s|sum' % kind
        ]
        return agg.reset_index(drop=False)

    print('Expect to see the following. \n\n\n')
    tot_unsaleable = np.sum(pwrct1_tidy['ExtCost'])
    returned = np.sum(pwunsale_tidy['ExtCost'])
    print('Total unsaleables expected: $%.2f' % tot_unsaleable)
    print('Total returns expected: $%.2f' % returned)

    print('\n\n\nAggregating RCT1 data by Product.')
    _agg_byproduct_rct = _avg_and_sum_by_product(pwrct1, 'Unsaleable')
    print('\nUpdated unsaleables: $%.2f \n' %
          np.sum(_agg_byproduct_rct['DollarsUnsaleable|sum']))

    print('Aggregating MTC data by Product.')
    _agg_byproduct_mtc = _avg_and_sum_by_product(pwunsale, 'Returned')
    print('\nUpdated returns: $%.2f \n' %
          np.sum(_agg_byproduct_mtc['DollarsReturned|sum']))

    print('Combining RCT and MTC data.')
    # Names come from the RCT side only; the MTC side contributes measures.
    _agg_byproduct_combined = _agg_byproduct_rct.merge(
        _agg_byproduct_mtc.drop(labels=['Supplier', 'Product'], axis=1),
        on=['SupplierId', 'ProductId'],
        how='outer')
    id_cols = ['ProductId', 'SupplierId']
    _agg_byproduct_combined[id_cols] = _agg_byproduct_combined[
        id_cols].astype(int)

    print('Merging in Directors on the SupplierId field.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(
        directors[['SupplierId', 'Director']], on='SupplierId', how='left')
    print('\nUpdated Unsaleables: $%.2f' %
          np.sum(_agg_byproduct_combined['DollarsUnsaleable|sum']))
    print('Updated Returns: $%.2f \n' %
          np.sum(_agg_byproduct_combined['DollarsReturned|sum']))

    print('Reordering columns.')
    reorder_cols = [
        'Director', 'SupplierId', 'Supplier', 'ProductId', 'Product',
        'DollarsUnsaleable|sum', 'CasesUnsaleable|sum',
        'DollarsUnsaleable|avg', 'CasesUnsaleable|avg',
        'DollarsReturned|sum', 'CasesReturned|sum', 'DollarsReturned|avg',
        'CasesReturned|avg'
    ]
    _agg_byproduct_combined = _agg_byproduct_combined[reorder_cols]

    print('Mapping in attribute columns.')
    _attrs = ['ProductId', 'Size', 'Class', 'QPC']
    _attributes = pwrct1[_attrs].drop_duplicates(subset='ProductId')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(
        _attributes, on='ProductId', how='left')

    print('Mapping in YTD sales by Product.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(
        pw_ytdprod, on='ProductId', how='left')

    print('Mapping in YTD sales by Supplier.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(
        pw_ytdsupp, on='SupplierId', how='left')

    print('Deriving percenteage of sales by Product.')
    _agg_byproduct_combined['PercentSales|byproduct'] = np.divide(
        _agg_byproduct_combined['DollarsUnsaleable|sum'],
        _agg_byproduct_combined['DollarSales|byproduct'])

    print('Deriving percenteage of sales by Suppplier.')
    _agg_byproduct_combined['PercentSales|bysupplier'] = np.divide(
        _agg_byproduct_combined['DollarsUnsaleable|sum'],
        _agg_byproduct_combined['DollarSales|bysupplier'])
    print('\nUpdated Unsaleables: $%.2f' %
          np.sum(_agg_byproduct_combined['DollarsUnsaleable|sum']))
    print('Updated Returns: $%.2f \n' %
          np.sum(_agg_byproduct_combined['DollarsReturned|sum']))

    print('Checking for and dropping Duplicates.')
    _agg_byproduct_combined = _agg_byproduct_combined.drop_duplicates()

    print('Replacing NaN values with zeros for readability.')
    _agg_byproduct_combined = _agg_byproduct_combined.fillna(0)

    print('Sorting in descending order on total unsaleables.\n\n\n')
    _agg_byproduct_combined = _agg_byproduct_combined.sort_values(
        'DollarsUnsaleable|sum', ascending=False)

    print('Resetting index.')
    _agg_byproduct_combined = _agg_byproduct_combined.reset_index(drop=True)

    print('Compare values below to originals. \n\n\n')
    new_tot_unsaleable = np.sum(
        _agg_byproduct_combined['DollarsUnsaleable|sum'])
    new_returned = np.sum(_agg_byproduct_combined['DollarsReturned|sum'])
    print(
        'Original Unsaleables: $%.2f \nPost-Processing Unsaleables: $%.2f \n'
        % (tot_unsaleable, new_tot_unsaleable))
    print('Original Returns: $%.2f \nPost-Processing Returns: $%.2f \n\n\n' %
          (returned, new_returned))

    print('*' * 100)
    print(
        'If the numbers above do not match then there is a bug in the program.'
    )
    print('*' * 100)

    return _agg_byproduct_combined
def aggregate_unsaleables_by_product(pwunsale_tidy, pwrct1_tidy, pw_ytdprod, pw_ytdsupp):
    '''
    Aggregates unsaleables returns & dumps by product.
    Takes tidy data as input (previous function).

    Args:
        pwunsale_tidy: DataFrame of returns (MTC); needs SupplierId,
            Supplier, ProductId, Product, CasesReturned and ExtCost.
        pwrct1_tidy: DataFrame of unsaleables (RCT1); needs SupplierId,
            Supplier, ProductId, Product, CasesUnsaleable, ExtCost, Size,
            Class and QPC.
        pw_ytdprod: DataFrame merged on ProductId; must supply
            'DollarSales|byproduct'.
        pw_ytdsupp: DataFrame merged on SupplierId; must supply
            'DollarSales|bysupplier'.

    Returns:
        DataFrame with one row per product, sorted by
        'DollarsUnsaleable|sum' descending, NaN replaced by 0.

    Notes:
        Reads the module-level ``directors`` frame (SupplierId, Director).
        Aggregation uses list-of-functions ``agg`` because nested renaming
        dicts are rejected by pandas >= 1.0; ``np.int`` (removed from NumPy)
        is replaced by the built-in ``int``.
    '''
    pwunsale = pwunsale_tidy
    pwrct1 = pwrct1_tidy
    grp_cols = ['SupplierId', 'Supplier', 'ProductId', 'Product']

    def _avg_and_sum_by_product(frame, kind):
        '''Per-product mean and sum of Cases<kind> and ExtCost, flat names.'''
        agg = frame.groupby(grp_cols)[['Cases' + kind, 'ExtCost']].agg(['mean', 'sum'])
        # Column order follows the selection and function lists above.
        agg.columns = ['Cases%s|avg' % kind, 'Cases%s|sum' % kind,
                       'Dollars%s|avg' % kind, 'Dollars%s|sum' % kind]
        return agg.reset_index(drop=False)

    print('Expect to see the following. \n\n\n')
    tot_unsaleable = np.sum(pwrct1_tidy['ExtCost'])
    returned = np.sum(pwunsale_tidy['ExtCost'])
    print('Total unsaleables expected: $%.2f' % tot_unsaleable)
    print('Total returns expected: $%.2f' % returned)

    print('\n\n\nAggregating RCT1 data by Product.')
    _agg_byproduct_rct = _avg_and_sum_by_product(pwrct1, 'Unsaleable')
    print('\nUpdated unsaleables: $%.2f \n' % np.sum(_agg_byproduct_rct['DollarsUnsaleable|sum']))

    print('Aggregating MTC data by Product.')
    _agg_byproduct_mtc = _avg_and_sum_by_product(pwunsale, 'Returned')
    print('\nUpdated returns: $%.2f \n' % np.sum(_agg_byproduct_mtc['DollarsReturned|sum']))

    print('Combining RCT and MTC data.')
    # Names come from the RCT side only; the MTC side contributes measures.
    _agg_byproduct_combined = _agg_byproduct_rct.merge(
        _agg_byproduct_mtc.drop(labels=['Supplier', 'Product'], axis=1),
        on=['SupplierId', 'ProductId'], how='outer')
    id_cols = ['ProductId', 'SupplierId']
    _agg_byproduct_combined[id_cols] = _agg_byproduct_combined[id_cols].astype(int)

    print('Merging in Directors on the SupplierId field.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(
        directors[['SupplierId', 'Director']], on='SupplierId', how='left')
    print('\nUpdated Unsaleables: $%.2f' % np.sum(_agg_byproduct_combined['DollarsUnsaleable|sum']))
    print('Updated Returns: $%.2f \n' % np.sum(_agg_byproduct_combined['DollarsReturned|sum']))

    print('Reordering columns.')
    reorder_cols = ['Director', 'SupplierId', 'Supplier', 'ProductId', 'Product',
                    'DollarsUnsaleable|sum', 'CasesUnsaleable|sum', 'DollarsUnsaleable|avg', 'CasesUnsaleable|avg',
                    'DollarsReturned|sum', 'CasesReturned|sum', 'DollarsReturned|avg', 'CasesReturned|avg']
    _agg_byproduct_combined = _agg_byproduct_combined[reorder_cols]

    print('Mapping in attribute columns.')
    _attrs = ['ProductId', 'Size', 'Class', 'QPC']
    _attributes = pwrct1[_attrs].drop_duplicates(subset='ProductId')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(_attributes, on='ProductId', how='left')

    print('Mapping in YTD sales by Product.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(pw_ytdprod, on='ProductId', how='left')

    print('Mapping in YTD sales by Supplier.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(pw_ytdsupp, on='SupplierId', how='left')

    print('Deriving percenteage of sales by Product.')
    _agg_byproduct_combined['PercentSales|byproduct'] = np.divide(
        _agg_byproduct_combined['DollarsUnsaleable|sum'], _agg_byproduct_combined['DollarSales|byproduct'])

    print('Deriving percenteage of sales by Suppplier.')
    _agg_byproduct_combined['PercentSales|bysupplier'] = np.divide(
        _agg_byproduct_combined['DollarsUnsaleable|sum'], _agg_byproduct_combined['DollarSales|bysupplier'])
    print('\nUpdated Unsaleables: $%.2f' % np.sum(_agg_byproduct_combined['DollarsUnsaleable|sum']))
    print('Updated Returns: $%.2f \n' % np.sum(_agg_byproduct_combined['DollarsReturned|sum']))

    print('Checking for and dropping Duplicates.')
    _agg_byproduct_combined = _agg_byproduct_combined.drop_duplicates()

    print('Replacing NaN values with zeros for readability.')
    _agg_byproduct_combined = _agg_byproduct_combined.fillna(0)

    print('Sorting in descending order on total unsaleables.\n\n\n')
    _agg_byproduct_combined = _agg_byproduct_combined.sort_values('DollarsUnsaleable|sum', ascending=False)

    print('Resetting index.')
    _agg_byproduct_combined = _agg_byproduct_combined.reset_index(drop=True)

    print('Compare values below to originals. \n\n\n')
    new_tot_unsaleable = np.sum(_agg_byproduct_combined['DollarsUnsaleable|sum'])
    new_returned = np.sum(_agg_byproduct_combined['DollarsReturned|sum'])
    print('Original Unsaleables: $%.2f \nPost-Processing Unsaleables: $%.2f \n' % (tot_unsaleable, new_tot_unsaleable))
    print('Original Returns: $%.2f \nPost-Processing Returns: $%.2f \n\n\n' % (returned, new_returned))

    print('*'*100)
    print('If the numbers above do not match then there is a bug in the program.')
    print('*'*100)

    return _agg_byproduct_combined