def make_column_table(cities, data):
    """Return a ColumnsTable of price statistics per city plus an overall row.

    For every entry of ``cities`` the rows of ``data`` whose ``city`` equals
    that entry are selected, and the mean, median, standard deviation, and
    count of their ``price`` values become one detail line.  A last detail
    line labelled '* all cities *' carries the same statistics computed over
    ``data.price`` as a whole.  The legend is appended before returning.
    """
    # NOTE(review): ('city'), ('mean'), ... are parenthesized strings, not
    # 1-tuples -- confirm ColumnsTable accepts a bare string as header.
    column_defs = (
        ('city', 30, '%30s', ('city'), 'name of city'),
        ('mean', 7, '%7.0f', ('mean'), 'mean price across time periods'),
        ('median', 7, '%7.0f', ('median'), 'median price across time periods'),
        ('stddev', 7, '%7.0f', ('stddev'), 'standard deviation of prices across time periods'),
        ('count', 7, '%7.0f', ('count'), 'number of transactions across time periods'),
    )
    table = ColumnsTable(column_defs)

    def add_row(label, price_series):
        # one detail line summarising a series of prices
        table.append_detail(
            city=label,
            mean=price_series.mean(),
            median=price_series.median(),
            stddev=price_series.std(),
            count=len(price_series),
        )

    for city in cities:
        add_row(city, data[data.city == city].price)

    # summary line is across all the cities
    add_row('* all cities *', data.price)

    table.append_legend()
    return table
class ReportWithColumnsTable(object):
    """A Report consisting of header lines followed by a ColumnsTable.

    header_lines    : iterable of lines appended to the report at construction
    column_defs     : column definitions passed through to ColumnsTable
    print_as_spaces : callable (name, value) -> truthy when the value should
                      be handed to the table as None instead of itself
    verbose         : passed through to ColumnsTable
    """

    def __init__(self, header_lines, column_defs, print_as_spaces, verbose=True):
        self._report = Report()
        self._header(header_lines)
        self._ct = ColumnsTable(column_defs, verbose)
        self._print_as_spaces = print_as_spaces

    def _header(self, header_lines):
        """Append each header line to the underlying report."""
        for line in header_lines:
            self._report.append(line)

    def append_detail(self, **kwds):
        """Append one detail row, substituting None where print_as_spaces says so."""
        # .items() behaves the same as .iteritems() here and also exists on Python 3
        with_spaces = {
            k: (None if self._print_as_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def write(self, path):
        """Append the legend, copy the table lines into the report, write to path."""
        # the table lives in self._ct; the previous code read self._t, which is
        # never assigned and therefore raised AttributeError
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        self._report.write(path)
def __init__(self, header_lines, column_defs, print_as_spaces, verbose=True):
    """Create the report, emit the header lines, and build the columns table.

    header_lines is forwarded to self._header; column_defs and verbose are
    forwarded to ColumnsTable; print_as_spaces is stored for later use.
    """
    self._print_as_spaces = print_as_spaces

    # the report must exist before the header is emitted
    report = Report()
    self._report = report
    self._header(header_lines)

    table = ColumnsTable(column_defs, verbose)
    self._ct = table
def make_details(data, test_months, n_best, n_worst):
    """Return (ColumnsTable, extra_info) with the top features for each test month.

    data        : mapping indexed by ReductionKey(test_month); each value has
                  attributes ``importances`` (a mapping) and ``model``
    test_months : iterable of test months, processed in order
    n_best      : number of highest-importance features listed per month
    n_worst     : number of lowest-importance features; their rows are
                  currently disabled, but the value still affects whether a
                  blank separator row is inserted

    extra_info is a list of [test_month, rank, probability, feature_name]
    rows mirroring the "best" detail lines of the table.

    A month whose value has no 'feature_importances' entry is reported on
    stdout and skipped.
    """
    extra_info = []
    feature_names = Features().ege_names(control.arg.features)
    columns_table = ColumnsTable(
        (
            ('test_month', 6, '%6s', ('test', 'month'), 'test month'),
            ('nth', 2, '%2d', (' ', 'n'), 'rank of feature (1 ==> more frequently included)'),
            ('probability', 4, '%4.1f', (' ', 'prob'), 'probability feature appears in a decision tree'),
            ('feature_name', 40, '%40s', (' ', 'feature name'), 'name of feature'),
        ),
        verbose=True,
    )
    for test_month in test_months:
        value = data[ReductionKey(test_month)]
        if 'feature_importances' not in value.importances:
            # one month has an ensemble model: report it and skip that month.
            # (Previously this dropped into pdb and then fell through to the
            # lookup below, which could only raise KeyError.)
            print('chart a sees an unexpected ensemble model')
            print('test_month %s' % (test_month,))
            print('value %s' % (value,))
            print('value.importance %s' % (value.importances,))
            print('skipping the test month')
            continue
        importances = value.importances['feature_importances']
        assert value.importances['features_group'] == control.arg.features, value
        model = value.model
        # exact type match on purpose (no subclass matching)
        assert type(model) in (ResultKeyGbr, ResultKeyRfr)
        sorted_indices = importances.argsort()  # sorted first lowest, last highest
        for nth_best in range(n_best):
            if nth_best == len(feature_names):
                break
            index = sorted_indices[len(importances) - nth_best - 1]
            probability = importances[index] * 100.0
            columns_table.append_detail(
                test_month=test_month,
                nth=nth_best + 1,
                probability=probability,
                feature_name=feature_names[index],
            )
            extra_info.append([test_month, nth_best + 1, probability, feature_names[index]])
        for nth in range(n_worst):
            break  # skip, for now: the worst-feature rows are intentionally disabled
            if nth == len(feature_names):
                break
            nth_worst = n_worst - nth - 1
            index = sorted_indices[nth_worst]
            columns_table.append_detail(
                test_month=test_month,
                nth=len(importances) - nth_worst,
                probability=importances[index] * 100.0,
                feature_name=feature_names[index],
            )
        if n_best > 1 or n_worst > 1:
            # insert blank line between test_months if more than 1 row in a month
            columns_table.append_detail()
    columns_table.append_legend()
    return columns_table, extra_info
def __init__(self, k, validation_month, ensemble_weighting, column_definitions, test):
    """Store the arguments, emit the header, and build the columns table.

    k, validation_month, and ensemble_weighting are only forwarded to
    self._header; column_definitions supplies the column definitions via
    defs_for_columns; test is stored as self._test.
    """
    self._test = test
    self._column_definitions = column_definitions

    # the report must exist before the header is emitted
    self._report = Report()
    self._header(k, validation_month, ensemble_weighting)

    wanted = (
        'description',
        'mae_validation',
        'mae_query',
        'mare_validation',
        'mare_query',
    )
    self._ct = ColumnsTable(
        columns=column_definitions.defs_for_columns(*wanted),
        verbose=True,
    )
def make_details(data, test_months):
    """Return a ColumnsTable of features ordered by descending mean importance.

    The mean importances come from make_mean_importance_by_feature(test_months);
    each is multiplied by 100 for display.  ``data`` is not used here.
    """
    table = ColumnsTable(
        (
            ("mean_prob", 5, "%5.2f", ("mean", "prob"), "mean probability feature appears in a decision tree"),
            ("feature_name", 40, "%40s", (" ", "feature name"), "name of feature"),
        ),
        verbose=True,
    )
    importance_by_name = make_mean_importance_by_feature(test_months)
    ranked_names = sorted(importance_by_name, key=importance_by_name.get, reverse=True)
    for name in ranked_names:
        table.append_detail(
            mean_prob=importance_by_name[name] * 100.0,
            feature_name=name,
        )
    table.append_legend()
    return table
def __init__(self, validation_month, k, column_definitions, test):
    """Create the report, emit its header, and build the columns table.

    validation_month and k are only forwarded to self._header;
    column_definitions supplies the column definitions via defs_for_columns;
    test is stored as self._test.
    """
    self._column_definitions = column_definitions
    self._test = test

    # the report must exist before the header is emitted
    self._report = Report()
    self._header(validation_month, k)

    wanted = (
        'median_absolute_error',
        'model',
        'n_months_back',
        'max_depth',
        'n_estimators',
        'max_features',
        'learning_rate',
    )
    self._ct = ColumnsTable(
        columns=column_definitions.defs_for_columns(*wanted),
        verbose=True,
    )
class ChartCDReport(object):
    """Report of median absolute error by month for the best models.

    column_definitions must provide defs_for_columns(*names) and
    replace_by_spaces(name, value); test, when truthy, makes write() add a
    '** TESTING: DISCARD' marker line.
    """

    def __init__(self, column_definitions, test):
        self._column_definitions = column_definitions
        self._test = test
        self._report = Report()
        cd = self._column_definitions.defs_for_columns(
            'validation_month', 'rank', 'median_absolute_error',
            'median_price', 'model', 'n_months_back', 'max_depth',
            'n_estimators', 'max_features', 'learning_rate', 'alpha',
            'l1_ratio', 'units_X', 'units_y',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)
        self._header()

    def append(self, line):
        """Append a free-form line directly to the report."""
        self._report.append(line)

    def write(self, path):
        """Append legend and table lines to the report, then write it to path."""
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def _header(self):
        """Append the report title and a spacer line."""
        self._report.append(
            'Median Absolute Error (MAE) by month for best-performing models and their hyperparameters'
        )
        self._report.append(' ')

    def append_detail(self, **kwds):
        """Append one table row; values flagged by replace_by_spaces become None."""
        # .items() is equivalent to .iteritems() here and also exists on Python 3
        with_spaces = {
            k: (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)
def __init__(self, k, ensemble_weighting, column_definitions, test):
    """Store the arguments, emit the header, and build the columns table.

    k and ensemble_weighting are only forwarded to self._header;
    column_definitions supplies the column definitions via defs_for_columns;
    test is stored as self._test.
    """
    self._test = test
    self._column_definitions = column_definitions

    # the report must exist before the header is emitted
    self._report = Report()
    self._header(k, ensemble_weighting)

    wanted = (
        'validation_month',
        'mae_index0',
        'mae_ensemble',
        'mae_best_next_month',
        'median_price',
        'fraction_median_price_next_month_index0',
        'fraction_median_price_next_month_ensemble',
        'fraction_median_price_next_month_best',
    )
    self._ct = ColumnsTable(
        columns=column_definitions.defs_for_columns(*wanted),
        verbose=True,
    )
def make_report(title, ordered_cities):
    """Return a Report: the title, a spacer line, then a per-city table.

    One detail row is produced per entry of ordered_cities, in that order.
    The row values are looked up by city in median_prices,
    median_prices_indices, n_trades, and n_trades_indices, which are not
    parameters and must be available from the surrounding scope.
    """
    table = ColumnsTable((
        ('city', 30, '%30s',
         ('', '', '', '', '', 'City'),
         'city name'),
        ('median_price', 7, '%7.0f',
         ('', '', '', '', 'median', 'price'),
         'median price in city'),
        ('median_price_index', 7, '%7.2f',
         ('median', 'price', '/', 'overall', 'median', 'price'),
         'median price as fraction of overall median price'),
        ('n_trades', 7, '%7.0f',
         ('', '', '', '', 'number', 'trades'),
         'number of trades across all months'),
        ('n_trades_index', 7, '%7.2f',
         ('number', 'trades', '/ ', 'overall', 'median', 'trades'),
         'median number trades as fraction of overall median number of trades'),
    ))
    for city in ordered_cities:
        table.append_detail(
            city=city,
            median_price=median_prices[city],
            median_price_index=median_prices_indices[city],
            n_trades=n_trades[city],
            n_trades_index=n_trades_indices[city],
        )
    table.append_legend(40)

    report = Report()
    for heading in (title, ' '):
        report.append(heading)
    for table_line in table.iterlines():
        report.append(table_line)
    return report
def make_details(data, control):
    """Return a ColumnsTable with the best model and its MAE per month and feature group.

    data[month][features] must expose ``model`` and ``mae`` attributes.
    Months come from control.test_months and feature groups from
    control.feature_groups.  A blank detail row follows each month, and a
    description of the feature groups is appended after the legend.
    """
    feature_group_notes = (
        ' ',
        'Features groups;',
        's : only size features',
        'sw : only size and wealth features',
        'swp : only size, wealth, and property features',
        'swpn : all features: size, wealth, property, and neighborhood',
    )
    table = ColumnsTable(
        (
            ('month', 6, '%6s', ('', 'month'), 'training month'),
            ('features', 8, '%8s', ('features', 'group'), 'group of features'),
            ('model', 5, '%5s', ('best', 'model'), 'family of best model'),
            ('mae', 6, '%6.0f', ('', 'mae'), 'mae of best model in month using features'),
        ),
        verbose=True,
    )
    for month in control.test_months:
        for group in control.feature_groups:
            best = data[month][group]
            table.append_detail(
                month=month,
                features=group,
                model=best.model,
                mae=best.mae,
            )
        table.append_detail()  # blank line separates each month
    table.append_legend()
    for note in feature_group_notes:
        table.append_line(note)
    return table
class ChartEReport(object):
    """Report on the best models individually and as an ensemble.

    column_definitions must provide defs_for_columns(*names) and
    replace_by_spaces(name, value); test, when truthy, makes write() add a
    '** TESTING: DISCARD' marker line.
    """

    def __init__(self, k, ensemble_weighting, column_definitions, test):
        self._column_definitions = column_definitions
        self._test = test
        self._report = Report()
        self._header(k, ensemble_weighting)
        cd = self._column_definitions.defs_for_columns(
            'validation_month', 'model', 'n_months_back', 'n_estimators',
            'max_features', 'max_depth', 'learning_rate', 'rank', 'weight',
            'mae_validation', 'mae_query', 'mae_ensemble',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def write(self, path):
        """Append legend and table lines to the report, then write it to path."""
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def detail_line(self, **kwds):
        """Append one table row; values flagged by replace_by_spaces become None."""
        # .items() is equivalent to .iteritems() here and also exists on Python 3
        with_spaces = {
            k: (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def _header(self, k, ensemble_weighting):
        """Append the title, a spacer, and the K / ensemble-weighting lines."""
        self._report.append(
            'Performance of Best Models Separately and as an Ensemble')
        self._report.append(' ')
        self._report.append('Considering Best K = %d models' % k)
        self._report.append('Ensemble weighting: %s' % ensemble_weighting)
def __init__(self, k, ensemble_weighting, column_definitions, test):
    """Store the arguments, emit the header, and build the columns table.

    k and ensemble_weighting are only forwarded to self._header;
    column_definitions supplies the column definitions via defs_for_columns;
    test is stored as self._test.
    """
    self._test = test
    self._column_definitions = column_definitions

    # the report must exist before the header is emitted
    self._report = Report()
    self._header(k, ensemble_weighting)

    wanted = (
        'validation_month',
        'model',
        'n_months_back',
        'n_estimators',
        'max_features',
        'max_depth',
        'learning_rate',
        'rank',
        'weight',
        'mae_validation',
        'mae_query',
        'mae_ensemble',
    )
    self._ct = ColumnsTable(
        columns=column_definitions.defs_for_columns(*wanted),
        verbose=True,
    )
class ChartBReport(object):
    """Report of MAE and hyperparameters for the k best models in one month.

    column_definitions must provide defs_for_columns(*names) and
    replace_by_spaces(name, value); test, when truthy, makes write() add a
    '**TESTING: DISCARD' marker line.
    """

    def __init__(self, validation_month, k, column_definitions, test):
        self._report = Report()
        self._header(validation_month, k)
        self._column_definitions = column_definitions
        self._test = test
        cd = self._column_definitions.defs_for_columns(
            'median_absolute_error', 'model', 'n_months_back', 'max_depth',
            'n_estimators', 'max_features', 'learning_rate',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def _header(self, validation_month, k):
        """Append the title, the validation month, and a spacer line."""
        for line in (
            'MAE for %d best-performing models and their hyperparameters' % k,
            'Validation month: %s' % validation_month,
            ' ',
        ):
            self._report.append(line)

    def append_detail(self, **kwds):
        """Append one table row; values flagged by replace_by_spaces become None."""
        # replace NaN with None
        # .items() is equivalent to .iteritems() here and also exists on Python 3
        with_spaces = {
            k: (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def write(self, path):
        """Append legend and table lines to the report, then write it to path."""
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('**TESTING: DISCARD')
        self._report.write(path)
class ChartFReport(object):
    """Report comparing errors of the ensemble with a best model that knows the future.

    column_definitions must provide defs_for_columns(*names) and
    replace_by_spaces(name, value); test, when truthy, makes write() add a
    '** TESTING: DISCARD' marker line.
    """

    def __init__(self, k, ensemble_weighting, column_definitions, test):
        self._column_definitions = column_definitions
        self._test = test
        self._report = Report()
        self._header(k, ensemble_weighting)
        cd = self._column_definitions.defs_for_columns(
            'validation_month', 'mae_index0', 'mae_ensemble',
            'mae_best_next_month', 'median_price',
            'fraction_median_price_next_month_index0',
            'fraction_median_price_next_month_ensemble',
            'fraction_median_price_next_month_best',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def write(self, path):
        """Append legend and table lines to the report, then write it to path."""
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def detail_line(self, **kwds):
        """Append one table row; values flagged by replace_by_spaces become None."""
        # .items() is equivalent to .iteritems() here and also exists on Python 3
        with_spaces = {
            k: (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def _header(self, k, ensemble_weighting):
        """Append the title, a spacer, and the K / ensemble-weighting lines."""
        self._report.append(
            'Comparison of Errors of Ensemble and Best Model That Know the Future'
        )
        self._report.append(' ')
        self._report.append('Considering Best K = %d models' % k)
        self._report.append('Ensemble weighting: %s' % ensemble_weighting)
def make_details(data, test_months, n_best, n_worst):
    """Return a ColumnsTable listing the most important features for each test month.

    data[ReductionKey(test_month)] must expose ``importances`` (a mapping with
    'feature_importances' and 'features_group') and ``model``.  For each month
    up to n_best rows are appended, highest importance first.  The n_worst
    rows are currently disabled, although n_worst still takes part in the
    decision to insert a blank separator row after a month.
    """
    names = Features().ege_names(control.arg.features)
    table = ColumnsTable(
        (
            ("test_month", 6, "%6s", ("test", "month"), "test month"),
            ("nth", 2, "%2d", (" ", "n"), "rank of feature (1 ==> more frequently included)"),
            ("probability", 4, "%4.1f", (" ", "prob"), "probability feature appears in a decision tree"),
            ("feature_name", 40, "%40s", (" ", "feature name"), "name of feature"),
        ),
        verbose=True,
    )
    # a blank row separates months only when a month can have more than 1 row
    separate_months = n_best > 1 or n_worst > 1

    for test_month in test_months:
        reduction = data[ReductionKey(test_month)]
        importances = reduction.importances["feature_importances"]
        assert reduction.importances["features_group"] == control.arg.features, reduction
        model = reduction.model
        assert type(model) == ResultKeyGbr or type(model) == ResultKeyRfr
        ascending = importances.argsort()  # sorted first lowest, last highest

        for rank in xrange(n_best):
            if rank == len(names):
                break  # ran out of feature names
            position = ascending[len(importances) - 1 - rank]
            table.append_detail(
                test_month=test_month,
                nth=rank + 1,
                probability=importances[position] * 100.0,
                feature_name=names[position],
            )

        for nth in xrange(n_worst):
            break  # skip, for now
            if nth == len(names):
                break
            nth_worst = n_worst - nth - 1
            position = ascending[nth_worst]
            table.append_detail(
                test_month=test_month,
                nth=len(importances) - nth_worst,
                probability=importances[position] * 100.0,
                feature_name=names[position],
            )

        if separate_months:
            table.append_detail()

    table.append_legend()
    return table
def __init__(self, column_definitions, test):
    """Store the arguments, build the columns table, then emit the header.

    column_definitions supplies the column definitions via defs_for_columns;
    test is stored as self._test.
    """
    self._test = test
    self._column_definitions = column_definitions
    self._report = Report()

    wanted = (
        'validation_month',
        'rank',
        'median_absolute_error',
        'median_price',
        'model',
        'n_months_back',
        'max_depth',
        'n_estimators',
        'max_features',
        'learning_rate',
        'alpha',
        'l1_ratio',
        'units_X',
        'units_y',
    )
    self._ct = ColumnsTable(
        columns=column_definitions.defs_for_columns(*wanted),
        verbose=True,
    )

    # in this constructor the header is emitted after the table is built
    self._header()
def make_table_stats(data, control, in_report_p):
    """Return a Report of monthly price statistics for the selected months.

    Months run from January 2003 through March 2009; in_report_p(year, month)
    decides whether a month gets a row.  The ratio columns compare a month
    with the most recent month that was itself reported, and are None for the
    first reported month.  ``control`` is not used here.
    """
    report = Report()
    report.append('Prices by Month')
    report.append('')

    table = ColumnsTable((
        ('year', 4, '%4d', (' ', ' ', 'year'), 'year of transaction'),
        ('month', 5, '%5d', (' ', ' ', 'month'), 'month of transaction'),
        ('mean_price', 6, '%6.0f', (' ', ' mean', 'price'), 'mean price in dollars'),
        ('median_price', 6, '%6.0f', (' ', 'median', 'price'), 'median price in dollars'),
        ('mean_price_ratio', 6, '%6.3f', (' mean', ' price', ' ratio'), 'ratio of price in current month to prior month'),
        ('median_price_ratio', 6, '%6.3f', ('median', ' price', ' ratio'), 'ratio of price in current month to prior month'),
        ('number_trades', 6, '%6d', ('number', 'of', 'trades'), 'number of trades in the month'),
    ))

    full_year = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
    first_quarter = (1, 2, 3)
    last_mean = None
    last_median = None
    for year in (2003, 2004, 2005, 2006, 2007, 2008, 2009):
        months = first_quarter if year == 2009 else full_year
        for month in months:
            if not in_report_p(year, month):
                continue
            prices = data[data.month == Month(year, month)].price
            mean_price = prices.mean()
            median_price = prices.median()

            if last_mean is None:
                mean_ratio = None
            else:
                mean_ratio = mean_price / last_mean
            if last_median is None:
                median_ratio = None
            else:
                median_ratio = median_price / last_median

            table.append_detail(
                year=year,
                month=month,
                mean_price=mean_price,
                median_price=median_price,
                mean_price_ratio=mean_ratio,
                median_price_ratio=median_ratio,
                number_trades=len(prices),
            )
            last_mean, last_median = mean_price, median_price

    table.append_legend()
    for table_line in table.iterlines():
        report.append(table_line)
    return report
def make_details(data, control):
    """Return (ColumnsTable, my_info) with the best model and MAE per month and feature group.

    data[month][features] must expose ``model`` and ``mae`` attributes.
    Months come from control.test_months and feature groups from
    control.feature_groups.  my_info holds one [month, features, model, mae]
    list per detail row.  A blank detail row follows each month, and a
    description of the feature groups is appended after the legend.
    """
    feature_group_notes = (
        ' ',
        'Features groups;',
        's : only size features',
        'sw : only size and wealth features',
        'swp : only size, wealth, and property features',
        'swpn : all features: size, wealth, property, and neighborhood',
    )
    table = ColumnsTable(
        (
            ('month', 6, '%6s', ('', 'month'), 'training month'),
            ('features', 8, '%8s', ('features', 'group'), 'group of features'),
            ('model', 5, '%5s', ('best', 'model'), 'family of best model'),
            ('mae', 6, '%6.0f', ('', 'mae'), 'mae of best model in month using features'),
        ),
        verbose=True,
    )
    rows = []
    for month in control.test_months:
        for group in control.feature_groups:
            best = data[month][group]
            table.append_detail(
                month=month,
                features=group,
                model=best.model,
                mae=best.mae,
            )
            rows.append([month, group, best.model, best.mae])
        table.append_detail()  # blank line separates each month
    table.append_legend()
    for note in feature_group_notes:
        table.append_line(note)
    return table, rows
def make_column_table(df):
    """Return a ColumnsTable with one row per row of df.

    Each row of df must provide 'city', 'count', and 'median_price'.  The
    legend is appended before the table is returned.
    """
    column_defs = (
        ('city', 30, '%30s', ('', 'city'), 'city in Los Angeles Country'),
        ('count', 6, '%6d', (' ', 'count'), 'number of transactions in 2007'),
        ('median_price', 7, '%7.0f', ('median', 'price'), 'median price'),
    )
    table = ColumnsTable(columns=column_defs)
    for _, row in df.iterrows():  # the index label is not needed
        table.append_detail(
            city=row['city'],
            count=row['count'],
            median_price=row['median_price'],
        )
    table.append_legend()
    return table
class ChartHReport(object):
    """Report on the best models individually and as an ensemble for one validation month.

    column_definitions must provide defs_for_columns(*names) and
    replace_by_spaces(name, value); test, when truthy, makes write() add a
    '** TESTING: DISCARD' marker line.
    """

    def __init__(self, k, validation_month, ensemble_weighting, column_definitions, test):
        self._column_definitions = column_definitions
        self._report = Report()
        self._test = test
        self._header(k, validation_month, ensemble_weighting)
        cd = self._column_definitions.defs_for_columns(
            'description', 'mae_validation', 'mae_query',
            'mare_validation', 'mare_query',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def write(self, path):
        """Append legend and table lines to the report, then write it to path."""
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def detail_line(self, **kwds):
        """Append one table row; values flagged by replace_by_spaces become None."""
        # .items() is equivalent to .iteritems() here and also exists on Python 3
        with_spaces = {
            k: (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def preformatted_line(self, line):
        """Echo line to stdout and append it to the table unformatted."""
        # print(line) with a single argument prints the same text on Python 2 and 3
        print(line)
        self._ct.append_line(line)

    def _header(self, k, validation_month, ensemble_weighting):
        """Append the title, a spacer, and the K / month / weighting lines."""
        self._report.append(
            'Performance of Best Models Separately and as an Ensemble')
        self._report.append(' ')
        self._report.append('Considering Best K = %d models' % k)
        self._report.append('For validation month %s' % validation_month)
        self._report.append('Ensemble weighting: %s' % ensemble_weighting)
def make_details(data, test_months):
    """Return (ColumnsTable, feature names, probabilities) ordered by mean importance.

    The mean importances come from make_mean_importance_by_feature(test_months)
    and are multiplied by 100 for display.  Every feature gets a table row;
    only features whose scaled value is at least 1 are also collected into
    the two returned lists.  ``data`` is not used here.
    """
    table = ColumnsTable(
        (
            ('mean_prob', 5, '%5.2f', ('mean', 'prob'), 'mean probability feature appears in a decision tree'),
            ('feature_name', 40, '%40s', (' ', 'feature name'), 'name of feature'),
        ),
        verbose=True,
    )
    kept_names = []
    kept_probs = []
    importance_by_name = make_mean_importance_by_feature(test_months)
    ranked_names = sorted(importance_by_name, key=importance_by_name.get, reverse=True)
    for name in ranked_names:
        percent = importance_by_name[name] * 100.0
        table.append_detail(mean_prob=percent, feature_name=name)
        if percent >= 1:
            kept_probs.append(percent)
            kept_names.append(name)
    table.append_legend()
    return table, kept_names, kept_probs
def make_chart_stats(data, control, filter_f):
    """Return a Report of monthly price statistics for the months accepted by filter_f.

    Every (year, month) from 2003-01 through 2009-12 is offered to filter_f;
    accepted months are looked up as data[make_reduction_key(year, month)],
    a mapping with 'mean', 'median', and 'count'.  The ratio columns compare
    a month with the most recent accepted month, and are None for the first
    accepted month.  ``control`` is not used here.
    """
    report = Report()
    report.append('Prices by Month')
    report.append('')

    table = ColumnsTable((
        ('year', 4, '%4d', (' ', ' ', 'year'), 'year of transaction'),
        ('month', 5, '%5d', (' ', ' ', 'month'), 'month of transaction'),
        ('mean_price', 6, '%6.0f', (' ', ' mean', 'price'), 'mean price in dollars'),
        ('median_price', 6, '%6.0f', (' ', 'median', 'price'), 'median price in dollars'),
        ('mean_price_ratio', 6, '%6.3f', (' mean', ' price', ' ratio'), 'ratio of price in current month to prior month'),
        ('median_price_ratio', 6, '%6.3f', ('median', ' price', ' ratio'), 'ratio of price in current month to prior month'),
        ('number_trades', 6, '%6d', ('number', 'of', 'trades'), 'number of trades in the month'),
    ))

    last_mean = None
    last_median = None
    for year in xrange(2003, 2010):
        for month in xrange(1, 13):
            if not filter_f(year, month):
                continue
            stats = data[make_reduction_key(year, month)]
            mean_price = stats['mean']
            median_price = stats['median']
            number_trades = stats['count']

            if last_mean is None:
                mean_ratio = None
            else:
                mean_ratio = mean_price / last_mean
            if last_median is None:
                median_ratio = None
            else:
                median_ratio = median_price / last_median

            table.append_detail(
                year=year,
                month=month,
                mean_price=mean_price,
                median_price=median_price,
                mean_price_ratio=mean_ratio,
                median_price_ratio=median_ratio,
                number_trades=number_trades,
            )
            last_mean, last_median = mean_price, median_price

    table.append_legend()
    for table_line in table.iterlines():
        report.append(table_line)
    return report