'''
This file creates the final dataset for the count of museums by zip code.
It writes the final dataset to a csv called museums_zipcode.csv located in the data folder.
The original csvs were downloaded from the Institute of Museum and Library Services website
and are joined together here and converted to a count by zip code
'''
import pandas as pd

import utils

museums_raw1 = pd.read_csv("data/raw_data/MuseumFile2018_File1_Nulls.csv")
museums_raw2 = pd.read_csv("data/raw_data/MuseumFile2018_File2_Nulls.csv")

select_cols = ["GZIP5"]
museums1 = museums_raw1.loc[:, select_cols]
museums2 = museums_raw2.loc[:, select_cols]
museums = pd.concat([museums1, museums2])  # DataFrame.append is deprecated; concatenate the two files

# coerce malformed zip codes to NaN, then count museums per zip code
museums = pd.to_numeric(museums['GZIP5'], errors="coerce")
count = pd.Series(museums.squeeze().values.ravel()).value_counts()
museums_count = pd.DataFrame({'zip': count.index, 'museums_count': count.values})

museums_csv = utils.compute_density(museums_count)
museums_csv.fillna(0, inplace=True)
museums_csv.to_csv("data/museums_zipcode.csv", index=False)
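The utils.compute_density helper shared by these scripts is not included in this section. Below is a minimal sketch of what it appears to do, assuming a zip-to-land-area lookup file; the file name "zip_land_area.csv", the column name "land_area_sqmi", and the units are assumptions, not the project's actual helper.

import pandas as pd

def compute_density(count_df):
    '''
    Hypothetical sketch: turn per-zip counts into per-area densities.
    Assumes count_df has a 'zip' column plus one or more numeric columns and
    that a zip -> land area lookup exists; the file and column names here are
    assumed, not the project's real ones.
    '''
    land_area = pd.read_csv("data/raw_data/zip_land_area.csv", dtype={"zip": str})
    counts = count_df.copy()
    counts["zip"] = counts["zip"].astype(str)  # align zip dtypes before joining
    merged = counts.merge(land_area[["zip", "land_area_sqmi"]], how="left", on="zip")
    for col in merged.columns:
        if col not in ("zip", "land_area_sqmi"):
            merged[col] = merged[col] / merged["land_area_sqmi"]  # NaN where area is unknown
    return merged.drop(columns="land_area_sqmi")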
"business_admin_support_and_waste_mngmt", "Agriculture, Forestry, Fishing and Hunting": \ "business_agri_forestry_fishing_hunting", "Arts, Entertainment, and Recreation": "business_arts_entertain", "Construction": "business_construction", "Educational Services": "business_schooling_services", "Finance and Insurance": "business_finance_and_insurance", "Health Care and Social Assistance": "business_hlth_care_social", "Industries not classified": "business_unclassified", "Information": "business_information", "Management of Companies and Enterprises": "business_mngmt", "Manufacturing": "business_manufacturing", "Mining, Quarrying, and Oil and Gas Extraction": "business_oil", "Other Services (except Public Administration)": "business_other", "Professional, Scientific, and Technical Services": \ "business_profess_sci_tech_services", "Real Estate and Rental and Leasing": "business_real_estate", "Retail Trade": "business_retail", "Transportation and Warehousing": "business_transport_warehouse", "Utilities": "business_utilities", "Wholesale Trade": "business_wholesale_trade"} biz_count.rename(columns=new_names, inplace=True) biz_count["zip"] = biz_count["zip"].astype(str) biz_count.reset_index(drop=True, inplace=True) biz_count = utils.compute_density(biz_count) biz_count.fillna(0, inplace=True) biz_count.to_csv("data/business_count.csv", index=False)
             'housing_owner_occupied', 'housing_renter_occupied', 'housing_unoccupied',
             'lfpr', 'unemployment_rate',
             'lastMove_after_2017', 'lastMove_2015-2016', 'lastMove_2010-2014',
             'lastMove_2000-2009', 'lastMove_1990-1999', 'lastMove_before_1989']

data = censusdata.download('acs5', 2019,
                           censusdata.censusgeo([('state', '*'),
                                                 ('zip code tabulation area', '*')]),
                           var_lst, tabletype='profile')
data.columns = col_names
data = data.add_prefix('census_')
data = data[(data >= 0).all(1)]  # only keep rows without any missing data

data[['zip', 'state']] = data.index.to_series().apply(utils.extract_state_and_zip)
data.reset_index(drop=True, inplace=True)

pop_density_data = utils.compute_density(data[['zip', 'census_totalPop']])
pop_density_data.columns = ['zip', 'census_popDensity']
data = data.merge(pop_density_data, how='left', on='zip')
data.drop('census_totalPop', axis=1, inplace=True)

data.to_csv('data/census_data.csv', index=False)
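utils.extract_state_and_zip is also not shown. A minimal sketch, under the assumption that each index entry is a censusdata.censusgeo object whose params() method returns ('state', fips) and ('zip code tabulation area', zcta) pairs; the real helper may parse the index differently:

import pandas as pd

def extract_state_and_zip(geo):
    # Hypothetical sketch: pull the ZCTA and state FIPS code out of a
    # censusdata.censusgeo index entry, in the (zip, state) order expected above.
    params = dict(geo.params())
    return pd.Series([params['zip code tabulation area'], params['state']])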
import numpy as np
from sklearn import datasets
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

# load_data, balance_data, make_train_test_split, and ut are project-local helpers
# assumed to be defined elsewhere in this module/package.


def load_datasets(dataset_name, dataset_dir, do_pca, pca_dims, add_bias, remove_mean,
                  density_sigma, interp_sigma):
    print(dataset_name)
    im_files = None
    explain_files = None
    class_names = None
    explain_interp = None  # for the explanation 1.0 means easy to interpret and 0.0 means hard

    if dataset_name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
        Y = iris.target
    elif dataset_name == 'wine':
        wine = datasets.load_wine()
        X = wine.data
        Y = wine.target
    elif dataset_name == 'breast_cancer':
        bc = datasets.load_breast_cancer()
        X = bc.data
        Y = bc.target
    elif dataset_name == '2d_outlier':
        num_exs = 100
        sig = 0.005
        pt = 0.3
        cls1 = np.random.multivariate_normal([pt, pt], [[sig, 0], [0, sig]], int(num_exs*0.8))
        cls2 = np.random.multivariate_normal([-pt, -pt], [[sig, 0], [0, sig]], int(num_exs*0.8))
        # add "noise"
        cls1n = np.random.multivariate_normal([pt, pt], [[sig*10, 0], [0, sig*10]], int(num_exs*0.2))
        cls2n = np.random.multivariate_normal([-pt, -pt], [[sig*10, 0], [0, sig*10]], int(num_exs*0.2))
        X = np.vstack((cls1, cls1n, cls2, cls2n))
        Y = np.ones(X.shape[0]).astype(int)
        Y[:int(num_exs*0.8) + int(num_exs*0.2)] = 0
    elif dataset_name == '3blobs':
        num_exs = 80
        cls1 = np.random.multivariate_normal([1.0, -1.0], [[0.12, 0], [0, 0.12]], num_exs)
        cls2 = np.random.multivariate_normal([-1.0, -1.0], [[0.12, 0], [0, 0.12]], num_exs)
        cls3 = np.random.multivariate_normal([-1.0, 1.0], [[0.12, 0], [0, 0.12]], num_exs)
        X = np.vstack((cls1, cls2, cls3))
        Y = np.ones(X.shape[0]).astype(int)
        Y[:num_exs] = 0
    elif dataset_name == 'blobs_2_class':
        X, Y = make_blobs(n_samples=200, centers=2, random_state=0)
    elif dataset_name == 'blobs_3_class':
        X, Y = make_blobs(n_samples=300, centers=3, random_state=0)
    else:
        X, Y, im_files, explain_files, class_names, explain_interp = \
            load_data(dataset_dir, dataset_name, interp_sigma)

    if im_files is None:
        im_files = np.asarray([''] * X.shape[0])
    if explain_files is None:
        explain_files = np.asarray([''] * X.shape[0])
    if class_names is None:
        class_names = np.asarray([''] * np.unique(Y).shape[0])
    if explain_interp is None:
        explain_interp = np.ones(X.shape[0])

    # standardize
    if remove_mean:
        X = X - X.mean(0)
        X = X / X.std(0)

    # do PCA
    if do_pca and X.shape[1] > 2:
        pca = PCA(n_components=2)
        pca.fit(X)
        X = pca.transform(X)
        X = X - X.mean(0)
        X = X / X.std(0)

    # add 1 for bias (intercept) term
    if add_bias:
        X = np.hstack((X, np.ones(X.shape[0])[..., np.newaxis]))

    # balance datasets - same number of examples per class
    X, Y, im_files, explain_files, explain_interp = balance_data(X, Y, im_files,
                                                                 explain_files, explain_interp)

    # train test split
    dataset_train, dataset_test = make_train_test_split(X, Y, im_files, explain_files,
                                                        class_names, explain_interp)

    # density of points
    dataset_train['X_density'] = ut.compute_density(dataset_train['X'], dataset_train['Y'],
                                                    density_sigma, True)

    print('train split')
    print(dataset_train['X'].shape[0], 'instances')
    print(dataset_train['X'].shape[1], 'features')
    print(np.unique(dataset_train['Y']).shape[0], 'classes')

    return dataset_train, dataset_test
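ut.compute_density here plays a different role from the zip-code helper in the scripts above: it assigns a density score to each training example. A minimal sketch of one way it could work, assuming a Gaussian kernel over points of the same class; the per-class flag and the normalization are guesses about the real implementation:

import numpy as np

def compute_density(X, Y, sigma, per_class=True):
    # Hypothetical sketch: mean Gaussian-kernel affinity of each example to the
    # other examples (restricted to its own class when per_class=True). The
    # actual ut.compute_density may use a different kernel or normalization.
    density = np.zeros(X.shape[0])
    for i in range(X.shape[0]):
        mask = (Y == Y[i]) if per_class else np.ones(X.shape[0], dtype=bool)
        sq_dists = ((X[mask] - X[i]) ** 2).sum(axis=1)
        density[i] = np.exp(-sq_dists / (2.0 * sigma ** 2)).mean()
    return density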
'''
This file creates the final dataset for the count of libraries by zip code.
It writes the final dataset to a csv called libraries_zipcode.csv located in the data folder.
The original csv was downloaded from the Institute of Museum and Library Services website
and converted to a count by zip code
'''
import pandas as pd

import utils

library_raw = pd.read_csv("data/raw_data/pls_fy18_outlet_pud18i.csv")

select_cols = ["ZIP"]
libraries = library_raw.loc[:, select_cols]

# count library outlets per zip code
count = pd.Series(libraries.squeeze().values.ravel()).value_counts()
lib_count = pd.DataFrame({'zip': count.index, 'libraries_count': count.values})

lib_csv = utils.compute_density(lib_count)
lib_csv.fillna(0, inplace=True)
lib_csv.to_csv("data/libraries_zipcode.csv", index=False)