def make_full():
    train = sfc.get_data('data/train.csv')

    # Formatting
    formatter = sfc.DataFormat()
    formatter.add_columns_enumerate(train)
    formatter.add_columns_time(train)

    make_dataset(train, 'data/all.csv', comment='All training data')
    return
def make_full():
    sfc.msg('Make full dataset')
    train = sfc.get_data('data/train.csv')

    # Formatting
    formatter = sfc.DataFormat()
    train = formatter.add_columns_enumerate(train)
    train = formatter.add_columns_time(train)
    train = formatter.add_weather(train)

    make_dataset(train, 'data/all.csv', comment='All training data')
    return
def make_test():
    sfc.msg('Make test dataset')
    test = sfc.get_data('data/test.csv')

    # Formatting
    formatter = sfc.DataFormat()
    test = formatter.add_columns_enumerate(test)
    test = formatter.add_columns_time(test)
    test = formatter.add_weather(test)

    make_dataset(test, 'data/test_format.csv', comment='Formatted test data')
    test = test.sort('Id', ascending=True).reset_index(drop=True)
    return test
def get(self):
    '''Pretty much the initialization and setup'''
    # If filename exists, then load, else make it
    if not os.path.exists(self.filename):
        adds = sfc.get_data('data/train.csv').Address.unique()
        adds = pandas.Series([self.add_code(x) for x in adds]).unique()
        self.code = dict(zip(adds, range(len(adds))))
        with open(self.filename, 'w') as f:
            pickle.dump(self.code, f)
    else:
        with open(self.filename, 'r') as f:
            self.code = pickle.load(f)
    return
def plot1():
    fig = pl.figure()
    data = sfc.get_data('data/all.csv')
    data.Category = data.Category.map(lambda x: x.capitalize())

    data_day = data[data.Darkness == 0]
    data_dark = data[data.Darkness == 2]
    data_day = data_day.groupby('Category').size()
    data_dark = data_dark.groupby('Category').size()
    data_day = data_day.map(lambda x: float(x) / data_day.sum())
    data_dark = data_dark.map(lambda x: float(x) / data_dark.sum())

    data_day.plot(kind='bar', label='Light', color='b', alpha=0.5)
    data_dark.plot(kind='bar', label='Dark', color='g', alpha=0.5)
    fig.subplots_adjust(bottom=0.40)
    pl.legend()
    return
def plot(self, *args):
    indices = ['Category']
    fig = pl.figure()
    cols = ['b', 'g', 'r', 'm']
    for col, (filename, label) in zip(cols, args):
        data = sfc.get_data(filename)[indices]
        data.Category = data.Category.map(lambda x: x.capitalize())
        hist = data.groupby('Category').size()
        hist = hist.map(lambda x: float(x) / hist.sum())
        hist.plot(kind='bar', color=col, alpha=0.5, label=label)

    fig.subplots_adjust(bottom=0.40)
    locs, labels = pl.xticks()
    #pl.setp(labels, rotation=20)
    pl.legend()
    pl.savefig('plots/categories_outside_pd.pdf')
    pl.close(fig)
    return
def make_trimmed():
    sfc.msg('Make trimmed datasets')
    train = sfc.get_data('data/train.csv')
    trimmed = trim_categories(train)

    # Formatting
    formatter = sfc.DataFormat()
    trimmed = formatter.add_columns_enumerate(trimmed)
    trimmed = formatter.add_columns_time(trimmed)
    trimmed = formatter.add_weather(trimmed)

    # Make the actual datasets
    make_dataset(trimmed, 'data/trim_1e4.csv', size=10000,
                 comment='Random set of training data. '
                         'Selected Categories, 1e4 records')
    #make_dataset(trimmed, 'data/trim_1e5.csv', size=100000,
    #             comment='Random set of training data. '
    #                     'Selected Categories, 1e5 records')
    #make_dataset(trimmed, 'data/trim.csv',
    #             comment='Random set of training data. '
    #                     'Selected Categories, all records')
    return
def make_dataset(input, output, comment='', verbose=False, size=None):
    data = None

    # Get the input data: accept either a CSV path or an in-memory DataFrame
    if isinstance(input, str):
        if not os.path.exists(input):
            input2 = os.path.join('data', input)
            if not os.path.exists(input2):
                raise IOError('Neither {} nor {} exist'.format(input, input2))
            input = input2
        data = sfc.get_data(input)
    elif isinstance(input, pandas.DataFrame):
        data = copy.deepcopy(input)
    else:
        raise IOError('Cannot deal with a {}'.format(type(input)))

    # Shrink to a seeded random subsample of 'size' records
    if size is not None and size < len(data):
        random.seed(sfc._SEED)
        data = data.ix[sorted(random.sample(xrange(len(data)), size))]
        data = data.reset_index(drop=True)

    sfc.write_data(data, output, comment=comment)
    return
def make_trimmed():
    train = sfc.get_data('data/train.csv')
    trimmed = trim_categories(train)

    # Formatting
    formatter = sfc.DataFormat()
    formatter.add_columns_enumerate(trimmed)
    #formatter.add_columns_resolution(trimmed)  # Almost certainly not used
    formatter.add_columns_time(trimmed)

    # Make the actual datasets
    make_dataset(trimmed, 'data/trim_1e4.csv', size=10000,
                 comment='Random set of training data. '
                         'Selected Categories, 1e4 records')
    make_dataset(trimmed, 'data/trim_1e5.csv', size=100000,
                 comment='Random set of training data. '
                         'Selected Categories, 1e5 records')
    make_dataset(trimmed, 'data/trim_1e6.csv', size=1000000,
                 comment='Random set of training data. '
                         'Selected Categories, 1e6 records')
    make_dataset(trimmed, 'data/trim.csv',
                 comment='Random set of training data. '
                         'Selected Categories, all records')
    return
def add(self, filename):
    finger = 0
    data = sfc.get_data(filename)
    weather = pandas.read_csv(self.weather, infer_datetime_format=True,
                              parse_dates=['DateTime'])

    # Make both frames ascending in date
    weather = weather.sort('DateTime', ascending=True)
    weather = weather.reset_index(drop=True)
    data = data.reindex(index=data.index[::-1])
    data = data.reset_index(drop=True)

    # For each record, walk the 'finger' forward to the weather observation
    # closest in time and copy that row's columns
    new_cols = {k: [None] * len(data) for k in weather.columns}
    for i, date in enumerate(data.Dates):
        while (finger + 1 < len(weather) and
               abs(weather.DateTime[finger] - date) >
               abs(weather.DateTime[finger + 1] - date)):
            finger += 1
        for k in weather.columns:
            new_cols[k][i] = weather[k][finger]

    # Join the weather columns and restore the original record order
    new_data = data.join(pandas.DataFrame(new_cols))
    new_data = new_data.reindex(index=new_data.index[::-1])
    return new_data.reset_index(drop=True)
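# Hedged sketch, not part of the pipeline above: the same "closest weather
# observation in time" alignment can be cross-checked with pandas.merge_asof
# (available in newer pandas) on a tiny synthetic example. The 'TempF' column
# and the helper name are illustrative assumptions, not part of this project.
def _nearest_weather_sketch():
    import pandas
    crimes = pandas.DataFrame({
        'Dates': pandas.to_datetime(['2015-01-01 00:10', '2015-01-01 03:40']),
        'Category': ['ASSAULT', 'VEHICLE THEFT'],
    })
    weather = pandas.DataFrame({
        'DateTime': pandas.to_datetime(['2015-01-01 00:00',
                                        '2015-01-01 04:00']),
        'TempF': [52.0, 49.0],
    })
    # Both frames must be sorted on their time keys for merge_asof
    merged = pandas.merge_asof(crimes.sort_values('Dates'),
                               weather.sort_values('DateTime'),
                               left_on='Dates', right_on='DateTime',
                               direction='nearest')
    return merged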
def main(load=True):
    indices = ['X', 'Y', 'PdDistrictInt']
    train = sfc.get_data('data/trim_1e4.csv', drop_data=True)[indices]
    all = sfc.get_data('data/all.csv', drop_data=True)

    knn = Juristictions()
    if load:
        knn.load()
    else:
        knn.train(train)
        knn.save()

    data = knn.outside_juristiction(all)
    sfc.write_data(data, 'data/outside_pd.csv', comment='Outside juristiction')
    return


###############################################################################

if __name__ == "__main__":
    #main(False)
    knn = Juristictions()
    knn.load()
    #knn.plot(('data/outside_pd.csv', 'Outside PD'), ('data/all.csv', 'all'))
    data = sfc.get_data('data/all.csv')
    all = knn.add_outside_juristiction(data)
    sfc.write_data(all, 'data/all.csv')

###############################################################################
    bdt_real_2 = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=8),
        n_estimators=10,
        learning_rate=1)
    #bdt_real = DecisionTreeClassifier(max_depth=None, min_samples_split=1,
    #                                  random_state=6065)
    bdt_real = BaggingClassifier(base_estimator=bdt_real_2,
                                 random_state=6065,
                                 n_estimators=100)
    #bdt_real = RandomForestClassifier(random_state=6065,
    #                                  n_estimators=200)
    #bdt_real = ExtraTreesClassifier(random_state=6065,
    #                                min_samples_split=5,
    #                                n_estimators=200)

    bdt_real.fit(X_train, y_train)
    y_predict = pandas.Series(bdt_real.predict(X_test))

    # Crude accuracy check: correct predictions vs. total on the hold-out set
    print len(y_predict[y_predict == y_test])
    print len(y_predict)
    return bdt_real


bdt = train_classifiers(sfc.get_data('data/all.csv'))
test = sfc.get_data('data/test.csv')
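# Hedged sketch (assumption: the target metric is multi-class log loss, as on
# the Kaggle leaderboard): the hold-out split above could also be scored on
# predicted probabilities rather than the raw hit count printed inside
# train_classifiers. The helper name is illustrative only.
def score_log_loss(clf, X_test, y_test):
    from sklearn.metrics import log_loss
    proba = clf.predict_proba(X_test)  # shape (n_samples, n_classes)
    return log_loss(y_test, proba, labels=clf.classes_)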
        t1, t2 = bin_times.index[0], bin_times.index[-1]
        m = (t1.hour * 60 + t1.minute + t2.hour * 60 + t2.minute) / 2
        all_bins.append(bin_times.sum())
        time_range.append(datetime.time(m / 60, m % 60))

    df_out = pandas.concat(all_bins, axis=1).T
    df_out.index = time_range
    return df_out


def sort_categories_by_frequency(cats):
    order = sorted(cats, key=lambda x: len(cats[x]))
    out = OrderedDict()
    for i in order:
        out[i] = cats[i]
    return out


###############################################################################

if __name__ == "__main__":
    df = sfc.get_data('data/trim_1e5.csv', drop_data=True)
    cats = sfc.data2dict(df, 'Category')
    pds = sfc.data2dict(df, 'PdDistrict')

    plotter = Plot2D()
    #plotter.plot(cats)
    plotter.plot_scatter(pds, 'PDs')

    pds_theft = {k: v[v.Category == 'VEHICLE THEFT']
                 for k, v in pds.iteritems()}
    plotter.plot_scatter(pds_theft, 'PDs_VehicleTheft')

###############################################################################