from pandas import read_csv

from mlxtend.utils.data import filename2path

# Note: df2Xy (splitting a DataFrame into a feature matrix X and a target
# vector y) is assumed to be a helper defined elsewhere in this module.


def boston_housing_data():
    """Boston Housing dataset.

    Source: https://archive.ics.uci.edu/ml/datasets/Housing
    Number of samples: 506
    Continuous target variable: MEDV
        MEDV = Median value of owner-occupied homes in $1000's

    Dataset Attributes:

        - 1) CRIM: per capita crime rate by town
        - 2) ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
        - 3) INDUS: proportion of non-retail business acres per town
        - 4) CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - 5) NOX: nitric oxides concentration (parts per 10 million)
        - 6) RM: average number of rooms per dwelling
        - 7) AGE: proportion of owner-occupied units built prior to 1940
        - 8) DIS: weighted distances to five Boston employment centres
        - 9) RAD: index of accessibility to radial highways
        - 10) TAX: full-value property-tax rate per $10,000
        - 11) PTRATIO: pupil-teacher ratio by town
        - 12) B: 1000(Bk - 0.63)^2 where Bk is the proportion of Black residents by town
        - 13) LSTAT: % lower status of the population

    Returns
    --------
    X, y : [n_samples, n_features], [n_samples]
        X is the feature matrix with 506 housing samples as rows
        and 13 feature columns.
        y is a 1-dimensional array of the continuous target variable MEDV.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/data/boston_housing_data/

    """
    data = read_csv(filename2path("boston-housing.csv"), index_col=0)
    return df2Xy(data, 'medv')
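# A minimal usage sketch for boston_housing_data (it runs only when this file
# is executed directly; the expected shapes come from the docstring above, and
# it assumes the df2Xy helper is available):
if __name__ == "__main__":
    X, y = boston_housing_data()
    print(X.shape)  # expected: (506, 13), i.e. 506 samples, 13 feature columns
    print(y.shape)  # expected: (506,), the continuous MEDV target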
def kaggle_gmsc_data_nomissing():
    """Kaggle "Give Me Some Credit" dataset with rows that contain
    missing values dropped.

    Returns the feature matrix X and the binary target variable
    y (SeriousDlqin2yrs).
    """
    data = read_csv(filename2path("give-me-some-credit"), index_col=0)
    data.dropna(inplace=True)
    return df2Xy(data, 'SeriousDlqin2yrs')
# du -h train_full_raw.csv
# wc -l train_full_raw.csv
# ```

# + [markdown] slideshow={"slide_type": "slide"}
# We'll go through this data in "chunks". Let's set the size of a chunk
# (the next cell is tagged `parameters` so the value can be overridden by
# papermill):

# + tags=["parameters"]
CHUNK_SIZE = 1000  # small value for fast execution; can be bumped to 10,000
# -

# Let's create a chunk "reader":

from mlxtend.utils.data import filename2path
from pandas import read_csv

TRAIN_FILE = filename2path("avazu")
HEADER = ['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
          'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
          'device_ip', 'device_model', 'device_type', 'device_conn_type',
          'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

reader = read_csv(TRAIN_FILE, chunksize=CHUNK_SIZE, names=HEADER, header=0,
                  index_col=0)
reader

# + [markdown] slideshow={"slide_type": "slide"}
# At a chunk size of 10,000 there are about 4,000 chunks in this data, so
# about 40,000 chunks at the default CHUNK_SIZE of 1,000 set above.
#
# Let's look at the first chunk:
# -
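# A minimal sketch, assuming the `reader` created above (the name
# `first_chunk` is illustrative): `read_csv` with `chunksize` returns an
# iterator of DataFrames, so `next` pulls one chunk. Note that this
# advances the reader, so the chunk is consumed.
first_chunk = next(reader)
first_chunk.head()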
def kaggle_house_prices():
    """Kaggle "House Prices" dataset with rows that contain missing
    values dropped.

    Returns the feature matrix X and the continuous target variable
    y (SalePrice).
    """
    data = read_csv(filename2path("house-prices"), index_col=0)
    data.dropna(inplace=True)
    # The target column is passed explicitly, as in the other loaders;
    # SalePrice is the target of the Kaggle House Prices competition.
    return df2Xy(data, 'SalePrice')
def hotel_reviews_data():
    """Hotel reviews dataset.

    Returns the raw review texts X and the corresponding labels y
    as 1-dimensional arrays.
    """
    data = read_csv(filename2path("hotel-reviews.csv"))
    X = data.text.values
    y = data.label.values
    return X, y
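# A hedged usage sketch for hotel_reviews_data (runs only when this file is
# executed directly; variable names are illustrative):
if __name__ == "__main__":
    reviews, labels = hotel_reviews_data()
    print(len(reviews), "reviews,", len(labels), "labels")  # one label per review
    print(reviews[0][:80])  # peek at the start of the first raw review string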