Example #1
'''
This file creates the final dataset for the count of museums
by zip code. It writes the final dataset to a csv called museums_zipcode.csv
located in the data folder

The original csvs were downloaded from the Institute of Museum and
Library Services website and are joined together here and converted
to a count by zip code
'''

import pandas as pd
import utils

museums_raw1 = pd.read_csv("data/raw_data/MuseumFile2018_File1_Nulls.csv")
museums_raw2 = pd.read_csv("data/raw_data/MuseumFile2018_File2_Nulls.csv")

select_cols = ["GZIP5"]

museums1 = museums_raw1.loc[:, select_cols]
museums2 = museums_raw2.loc[:, select_cols]

museums = pd.concat([museums1, museums2], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
museums = pd.to_numeric(museums['GZIP5'], errors="coerce")

count = museums.value_counts()  # number of museums per zip code
museums_count = pd.DataFrame({
    'zip': count.index,
    'museums_count': count.values
})

museums_csv = utils.compute_density(museums_count)
museums_csv.fillna(0, inplace=True)

museums_csv.to_csv("data/museums_zipcode.csv", index=False)
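
The utils.compute_density helper called here and in the zip-code scripts below is not included in this excerpt. A minimal sketch of what it might look like, assuming it left-joins a zip-level land-area lookup (the file name and land_area_sqmi column are guesses) and divides each count column by area, which would also explain the NaN values the callers fill with 0:

import pandas as pd

def compute_density(counts):
    """Hypothetical stand-in for utils.compute_density: divide every
    non-'zip' column by the zip code's land area."""
    # Assumed lookup file and column names -- not confirmed by the original code.
    area = pd.read_csv("data/raw_data/zip_land_area.csv")  # columns: 'zip', 'land_area_sqmi'
    out = counts.merge(area, how="left", on="zip")  # unmatched zips -> NaN; callers fillna(0)
    value_cols = [c for c in counts.columns if c != "zip"]
    out[value_cols] = out[value_cols].div(out["land_area_sqmi"], axis=0)
    return out[["zip"] + value_cols]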
             "business_admin_support_and_waste_mngmt",
             "Agriculture, Forestry, Fishing and Hunting": \
             "business_agri_forestry_fishing_hunting",
             "Arts, Entertainment, and Recreation": "business_arts_entertain",
             "Construction": "business_construction",
             "Educational Services": "business_schooling_services",
             "Finance and Insurance": "business_finance_and_insurance",
             "Health Care and Social Assistance": "business_hlth_care_social",
             "Industries not classified": "business_unclassified",
             "Information": "business_information",
             "Management of Companies and Enterprises": "business_mngmt",
             "Manufacturing": "business_manufacturing",
             "Mining, Quarrying, and Oil and Gas Extraction": "business_oil",
             "Other Services (except Public Administration)": "business_other",
             "Professional, Scientific, and Technical Services": \
             "business_profess_sci_tech_services",
             "Real Estate and Rental and Leasing": "business_real_estate",
             "Retail Trade": "business_retail",
             "Transportation and Warehousing": "business_transport_warehouse",
             "Utilities": "business_utilities",
             "Wholesale Trade": "business_wholesale_trade"}
biz_count.rename(columns=new_names, inplace=True)

biz_count["zip"] = biz_count["zip"].astype(str)
biz_count.reset_index(drop=True, inplace=True)

biz_count = utils.compute_density(biz_count)
biz_count.fillna(0, inplace=True)

biz_count.to_csv("data/business_count.csv", index=False)
             'housing_owner_occupied',
             'housing_renter_occupied',
             'housing_unoccupied',
             'lfpr',
             'unemployment_rate',
             'lastMove_after_2017',
             'lastMove_2015-2016',
             'lastMove_2010-2014',
             'lastMove_2000-2009',
             'lastMove_1990-1999',
             'lastMove_before_1989']

data = censusdata.download('acs5', 2019,
                           censusdata.censusgeo([('state', '*'),
                                                 ('zip code tabulation area', '*')]),
                           var_lst, tabletype='profile')
data.columns = col_names
data = data.add_prefix('census_')

data = data[(data >= 0).all(1)] # only keep rows without any missing data

data[['zip', 'state']] = data.index.to_series().apply(utils.extract_state_and_zip)
data.reset_index(drop=True, inplace=True)

pop_density_data = utils.compute_density(data[['zip', 'census_totalPop']])
pop_density_data.columns = ['zip', 'census_popDensity']
data = data.merge(pop_density_data, how='left', on='zip')
data.drop('census_totalPop', axis=1, inplace=True)

data.to_csv('data/census_data.csv', index=False)
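
utils.extract_state_and_zip is likewise not shown. censusdata indexes each downloaded row with a censusgeo object; a hypothetical version, assuming censusgeo.params() exposes the (level, code) pairs requested above, could look like this:

import pandas as pd

def extract_state_and_zip(geo):
    """Hypothetical stand-in for utils.extract_state_and_zip: pull the ZCTA and
    state FIPS code out of a single censusgeo index value."""
    params = dict(geo.params())  # e.g. {'state': '06', 'zip code tabulation area': '90210'}
    return pd.Series([params['zip code tabulation area'], params['state']])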
Example #4
def load_datasets(dataset_name, dataset_dir, do_pca, pca_dims, add_bias, remove_mean, density_sigma, interp_sigma):
    print(dataset_name)

    im_files = None
    explain_files = None
    class_names = None
    explain_interp = None  # explanation interpretability score: 1.0 = easy to interpret, 0.0 = hard

    if dataset_name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
        Y = iris.target
    elif dataset_name == 'wine':
        wine = datasets.load_wine()
        X = wine.data
        Y = wine.target
    elif dataset_name == 'breast_cancer':
        bc = datasets.load_breast_cancer()
        X = bc.data
        Y = bc.target
    elif dataset_name == '2d_outlier':
        num_exs = 100
        sig = 0.005
        pt = 0.3
        cls1 = np.random.multivariate_normal([pt, pt], [[sig, 0],[0,sig]], int(num_exs*0.8))
        cls2 = np.random.multivariate_normal([-pt, -pt], [[sig, 0],[0,sig]], int(num_exs*0.8))
        # add "noise"
        cls1n = np.random.multivariate_normal([pt, pt], [[sig*10, 0],[0,sig*10]], int(num_exs*0.2))
        cls2n = np.random.multivariate_normal([-pt, -pt], [[sig*10, 0],[0,sig*10]], int(num_exs*0.2))
        X = np.vstack((cls1, cls1n, cls2, cls2n))
        Y = np.ones(X.shape[0]).astype(int)  # np.int was removed in NumPy 1.24
        Y[:int(num_exs*0.8) + int(num_exs*0.2)] = 0  # cls1 + cls1n form class 0
    elif dataset_name == '3blobs':
        num_exs = 80
        cls1 = np.random.multivariate_normal([1.0, -1.0], [[0.12, 0],[0,0.12]], num_exs)
        cls2 = np.random.multivariate_normal([-1.0, -1.0], [[0.12, 0],[0,0.12]], num_exs)
        cls3 = np.random.multivariate_normal([-1.0, 1.0], [[0.12, 0],[0,0.12]], num_exs)
        X = np.vstack((cls1,cls2, cls3))
        Y = np.ones(X.shape[0]).astype(int)
        Y[:num_exs] = 0
    elif dataset_name == 'blobs_2_class':
        X, Y = make_blobs(n_samples=200, centers=2, random_state=0)
    elif dataset_name == 'blobs_3_class':
        X, Y = make_blobs(n_samples=300, centers=3, random_state=0)
    else:
        X, Y, im_files, explain_files, class_names, explain_interp = load_data(dataset_dir, dataset_name, interp_sigma)

    if im_files is None:
        im_files = np.asarray(['']*X.shape[0])
    if explain_files is None:
        explain_files = np.asarray(['']*X.shape[0])
    if class_names is None:
        class_names = np.asarray(['']*np.unique(Y).shape[0])
    if explain_interp is None:
        explain_interp = np.ones(X.shape[0])

    # standardize
    if remove_mean:
        X = X - X.mean(0)
        X = X / X.std(0)

    # do PCA
    if do_pca and X.shape[1] > 2:
        pca = PCA(n_components=2)
        pca.fit(X)
        X = pca.transform(X)
        X = X - X.mean(0)
        X = X / X.std(0)

    # add 1 for bias (intercept) term
    if add_bias:
        X = np.hstack((X, np.ones(X.shape[0])[..., np.newaxis]))

    # balance datasets - same number of examples per class
    X, Y, im_files, explain_files, explain_interp = balance_data(X, Y, im_files, explain_files, explain_interp)

    # train test split
    dataset_train, dataset_test = make_train_test_split(X, Y, im_files, explain_files, class_names, explain_interp)

    # density of points
    dataset_train['X_density'] = ut.compute_density(dataset_train['X'], dataset_train['Y'], density_sigma, True)

    print('train split')
    print(dataset_train['X'].shape[0], 'instances')
    print(dataset_train['X'].shape[1], 'features')
    print(np.unique(dataset_train['Y']).shape[0], 'classes')

    return dataset_train, dataset_test
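
For reference, a call to load_datasets might look like the following; the argument values are illustrative only and are not taken from the original project:

dataset_train, dataset_test = load_datasets(
    dataset_name='blobs_2_class',  # one of the built-in synthetic options handled above
    dataset_dir='',                # unused for the synthetic datasets
    do_pca=False, pca_dims=2,
    add_bias=True, remove_mean=True,
    density_sigma=0.1, interp_sigma=0.1)
print(dataset_train['X'].shape[0], 'training instances')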
'''
This file creates the final dataset for the count of libraries
by zip code. It writes the final dataset to a csv called libraries_zipcode.csv
located in the data folder

The original csv was downloaded from the Institute of Museum and
Library Services website and converted to a count by zip code
'''

import pandas as pd
import utils

library_raw = pd.read_csv("data/raw_data/pls_fy18_outlet_pud18i.csv")

select_cols = ["ZIP"]

libraries = library_raw.loc[:, select_cols]

count = libraries["ZIP"].value_counts()  # number of library outlets per zip code
lib_count = pd.DataFrame({'zip': count.index, 'libraries_count': count.values})

lib_csv = utils.compute_density(lib_count)
lib_csv.fillna(0, inplace=True)

lib_csv.to_csv("data/libraries_zipcode.csv", index=False)
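
Each of these scripts writes a per-zip-code CSV sharing a 'zip' column, so they can presumably be joined downstream into a single table. A hypothetical merge of the files produced above (the merge step and output name are assumptions, not part of the original examples):

import functools
import pandas as pd

paths = ["data/museums_zipcode.csv", "data/libraries_zipcode.csv",
         "data/business_count.csv", "data/census_data.csv"]
frames = [pd.read_csv(p, dtype={"zip": str}) for p in paths]
combined = functools.reduce(
    lambda left, right: left.merge(right, how="outer", on="zip"), frames)
combined.to_csv("data/combined_zipcode.csv", index=False)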