def get_zipcode_locations():
    file = '../zipcodes/zipcodes.txt'
    fields, zipcode_data = create_data_set.load_csv(file, has_field_names=True, dtype=np.float)
    locs = zipcode_data[:, [2, 1]]
    zip_codes = zipcode_data[:, 0].astype(np.int)
    zipcode_location_map = dict()
    for z, loc in zip(zip_codes, locs):
        zipcode_location_map[z] = loc
    return zipcode_location_map
def get_zipcode_locs():
    loc_fields, loc_data = create_data_set.load_csv(
        file_name_zip_lat_long, dtype='string', return_data_frame=True)
    zipcode = loc_data.Zipcode.values.astype(np.int)
    zip_lat = loc_data.Lat.values.astype(np.float)
    zip_lon = loc_data.Long.values.astype(np.float)
    zip_loc = np.stack((zip_lon, zip_lat), 1)
    has_loc = np.isfinite(zip_loc.sum(1))
    d = dict(zip(zipcode[has_loc], zip_loc[has_loc, :]))
    return d
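# Hypothetical usage sketch (not in the original source): the map built above
# is keyed by integer zipcode and stores (lon, lat) pairs with non-finite
# entries already filtered out. The sample zipcode is illustrative only.
zipcode_locs = get_zipcode_locs()
if 90001 in zipcode_locs:
    print zipcode_locs[90001]  # prints the stored [lon, lat] pair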
def create_stations(file):
    feat_names, data = create_data_set.load_csv(
        file, True, dtype='str', delim=',', num_rows=1000000000)
    names = data[:, array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    return names, locs.astype(np.float)
def get_zipcode_housing():
    housing_fields, housing_data = create_data_set.load_csv(
        file_name_house_size, dtype='string', return_data_frame=True)
    zipcodes = housing_data.ZIP.values.astype(np.float)
    totals = housing_data.Total.values.astype(np.float)
    # Weight the household-size count columns (columns 4 onward, presumably
    # sizes 1 through 7) by size, then divide by the household total to get
    # the mean household size per zipcode.
    households = housing_data.values[:, 4:].astype(np.float)
    weight_vec = np.arange(1, 8)
    sums = households.dot(weight_vec)
    mean_households = sums / totals
    # Drop zipcodes with undefined means or too few households to be reliable.
    I = np.isfinite(mean_households) & (totals > 100)
    d = dict(zip(zipcodes[I], mean_households[I]))
    return d
def get_zipcode_wages():
    income_fields, income_data = create_data_set.load_csv(
        file_name_income, dtype='string', return_data_frame=True)
    zipcode = income_data.ZipCode.values.astype(np.float)
    agi = income_data.AdjustedGrossIncome.values.astype('string')
    num_returns = income_data.NumberOfReturns.values.astype('string')
    # The file apparently has eight rows per zipcode (income brackets); take
    # every eighth row starting from the first real zipcode (90001).
    i = find_first_element(zipcode, 90001)
    I = np.arange(i, zipcode.shape[0], 8)
    zipcode = zipcode[I].astype(np.int)
    agi = agi[I].astype(np.float)
    num_returns = num_returns[I].astype(np.float)
    mean_income = agi / num_returns
    # Keep zipcodes with enough returns and trim extreme high-income outliers.
    I = (num_returns > 50) & (mean_income < np.percentile(mean_income, 99.6))
    d = dict(zip(zipcode[I], mean_income[I]))
    return d
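# Hypothetical sketch (not in the original source) joining the per-zipcode
# maps defined above: collect mean wages alongside coordinates for zipcodes
# present in both dictionaries, e.g. as input to a geospatial plot.
wages = get_zipcode_wages()
locs = get_zipcode_locs()
wage_rows = [(locs[z][0], locs[z][1], w) for z, w in wages.items() if z in locs]
print str(len(wage_rows)) + ' zipcodes have both wages and locations'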
import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions

x_file_name = 'SpecificStages-truth-feats.csv'
y_file_name = 'SpecificStages-truth.csv'

# Load the labels and build a map from instance ID to label.
_, y = create_data_set.load_csv(y_file_name, True, dtype='str', delim='\t')
y = y[1:, :]
id_to_y = dict((yi[0], int(yi[3])) for yi in y)

# Load the features; the first column holds the instance IDs.
feature_names, feats = create_data_set.load_csv(x_file_name, True, dtype='str', delim=str('\t'))
feats = feats[1:, :]
ids = feats[:, 0]
feats = np.asarray(feats, dtype='float')
x = feats[:, 1:]

# Align labels to the feature rows; mark instances with no label as -1.
y = np.zeros((x.shape[0], 1))
for idx, i in enumerate(ids):
    if i in id_to_y:
        y[idx] = id_to_y[i]
    else:
        print 'missing id'
        y[idx] = -1
data = (x, y)
helper_functions.save_object('processed_data.pkl', data)
# The definition line of this date helper was truncated in the source; the
# name and signature below are an assumption based on the body.
def get_date(s):
    s = s[0]
    year = s[:4]
    month = s[4:6]
    day = s[6:8]
    d = date(int(year), int(month), int(day))
    return d

create_geospatial_data = True
split_date = False
file_name = 'kc_house_data.csv'
save_data = True
sampled_size = 1000

feat_names, data = create_data_set.load_csv(file_name, True, dtype='str', delim=',')
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
y = data[:, y_ind].astype(np.float)
y /= 100000
suffix = ''
if create_geospatial_data:
    x_feats = ['long', 'lat']
    x_feat_inds = array_functions.find_set(feat_names, x_feats)
    x = data[:, x_feat_inds]
    x = array_functions.remove_quotes(x)
    x = x.astype(np.float)
    x[:, 0] = array_functions.normalize(x[:, 0])
    x[:, 1] = array_functions.normalize(x[:, 1])
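# Illustrative check of the reconstructed helper above. The sample string
# matches the kc_house_data 'date' column format and is assumed, not taken
# from the source.
print get_date(['20141013T000000'])  # -> 2014-10-13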
    return d

file_names = daily_file_names
if use_monthly:
    file_names = monthly_file_names
feats_to_keep = ['STATION', 'STATION_NAME', 'LATITUDE', 'LONGITUDE', 'DATE',
                 'TAVG', 'TMAX', 'TMIN', 'PRCP']
if use_monthly:
    feats_to_keep[1] = 'NAME'
for i, file in enumerate(file_names):
    feat_names_curr, data_curr = create_data_set.load_csv(
        file, True, dtype='str', delim=',', num_rows=1000000000)
    inds_to_use = np.asarray(
        [j for j in range(feat_names_curr.size) if feat_names_curr[j] in feats_to_keep])
    assert inds_to_use.size == len(feats_to_keep)
    data_curr = data_curr[:, inds_to_use]
    feat_names_curr = feat_names_curr[inds_to_use]
    if i == 0:
        feat_names = feat_names_curr
        data = data_curr
        continue
    unique_stations = np.unique(data[:, find_first_element(feat_names, 'STATION')].astype(np.str))
    curr_stations = data_curr[:, find_first_element(feat_names, 'STATION')].astype(np.str)
    to_remove = array_functions.false(data_curr.shape[0])
def run_main():
    import caffe
    adience_caffe_model_dir = 'C:\\Users\\Aubrey\\Desktop\\cnn_age_gender_models_and_data.0.0.2\\'
    age_net_pretrained = '/age_net.caffemodel'
    age_net_model_file = '/deploy_age.prototxt'
    age_net = caffe.Classifier(adience_caffe_model_dir + age_net_model_file,
                               adience_caffe_model_dir + age_net_pretrained,
                               channel_swap=(2, 1, 0),
                               raw_scale=255,
                               image_dims=(256, 256))
    age_list = ['(0, 2)', '(4, 6)', '(8, 12)', '(15, 20)',
                '(25, 32)', '(38, 43)', '(48, 53)', '(60, 100)']
    adience_image_dir = 'C:\\Users\\Aubrey\\Desktop\\adience_aligned\\aligned\\'
    adience_metadata_file = 'C:\\Users\\Aubrey\\Desktop\\adience_aligned\\alined_metadata\\all_photos.csv'
    metadata = create_data_set.load_csv(adience_metadata_file, dtype='string', delim='\t')
    column_names = metadata[0].tolist()
    photo_data = metadata[1]
    face_id_col = column_names.index('face_id')
    user_id_col = column_names.index('user_id')
    image_name_col = column_names.index('original_image')
    age_col = column_names.index('age')
    x = np.zeros((photo_data.shape[0], 512))
    y = np.zeros((photo_data.shape[0]))
    id = np.zeros((photo_data.shape[0]))
    i = 0
    last_perc_done = 0
    for idx, row in enumerate(photo_data):
        # Print progress once per percent.
        perc_done = math.floor(100 * float(idx) / len(photo_data))
        if perc_done > last_perc_done:
            last_perc_done = perc_done
            print str(perc_done) + '% done'
        image_dir = adience_image_dir + row[user_id_col] + '/'
        face_id = row[face_id_col]
        image = image_dir + 'landmark_aligned_face.' + str(face_id) + '.' + row[image_name_col]
        if not os.path.isfile(image):
            print 'Skipping: ' + image
            continue
        input_image = caffe.io.load_image(image)
        age = row[age_col]
        # Extract fc7 activations as the feature vector for this face.
        blobs = ['fc7']
        features_age = predict_blobs(age_net, [input_image], blobs)
        x[i, :] = features_age
        y[i] = extract_age(age)
        id[i] = float(face_id)
        i += 1
    # Trim rows reserved for images that were skipped.
    x = x[:i, :]
    y = y[:i]
    id = id[:i]
    data = data_class.Data()
    data.x = x
    data.instance_ids = id
    data.y = y
    data.is_regression = True
    data.set_train()
    data.set_target()
    data.set_true_y()
    data_file = create_data_set.adience_aligned_cnn_file
    helper_functions.save_object('data_sets/' + data_file, data)
    print 'TODO'
    data.is_regression = True
    return data

def combine_data(x1, y1, x2, y2):
    x = np.vstack((x1, x2))
    y = np.concatenate((y1, y2))
    # Tag each instance with the data set it came from (0 or 1).
    data_set_ids = np.concatenate((np.zeros(y1.size), np.ones(y2.size)))
    data = data_lib.Data(x, y)
    data.data_set_ids = data_set_ids
    data.is_regression = True
    return data

if use_zipcode_data:
    file = 'Zip_Zhvi_AllHomes.csv'
    data_fields, string_data = create_data_set.load_csv(file, has_field_names=True, dtype='string')
    zip_code = vec_remove_quotations(string_data[:, 1]).astype(np.int)
    state = vec_remove_quotations(string_data[:, 3])
    #year1_idx = array_functions.find_first_element(data_fields, '1996-04')
    year1_idx = array_functions.find_first_element(data_fields, '2001-01')
    #year1_idx = array_functions.find_first_element(data_fields, '2016-02')
    year2_idx = array_functions.find_first_element(data_fields, '2017-02')
    pricing_data = string_data[:, [year1_idx, year2_idx]]
    pricing_data = vec_replace(pricing_data).astype(np.float)
    zipcode_location_map = get_zipcode_locations()
    locations = np.zeros((zip_code.size, 2))
    for i, z in enumerate(zip_code):
        if z not in zipcode_location_map:
            print 'missing zipcode: ' + str(z)
            locations[i, :] = np.nan
            continue
import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions
from datetime import date
from matplotlib import pyplot as pl
from data import data as data_lib

# Cache the parsed CSV so reruns skip the slow load.
try:
    data = helper_functions.load_object('train.pkl')
except:
    file_name = 'train.csv'
    feat_names, data = create_data_set.load_csv(file_name, True, dtype=np.float, delim=',')
    data = data.astype(np.float)
    Y = data[:, 0]
    X = data[:, 1:]
    data = {'X': X, 'Y': Y}
    helper_functions.save_object('train.pkl', data)
x = data['X']
x /= 256  # features appear to be 8-bit pixel values; scale to [0, 1)
y = data['Y']
data = data_lib.Data(x, y)
helper_functions.save_object('raw_data.pkl', data)
from utility.array_functions import find_first_element
import datetime

file_name = 'pollution_us_2000_2016.csv'

def to_date(date_str):
    a = date_str.split('-')
    year, month, day = [int(s) for s in a]
    d = datetime.date(year, month, day)
    return d

feat_names, data = create_data_set.load_csv(
    file_name, True, dtype='str', delim=',',
    #num_rows=40000
    num_rows=100000000000)
y_names = [s + ' Mean' for s in ['NO2', 'O3', 'SO2', 'CO']]
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
to_keep = array_functions.false(data.shape[0])
date_strs = data[:, find_first_element(feat_names, 'Date Local')]
prev = ''
def load_taxi_data(num_files_to_load=np.inf, num_bins=50, use_alternate=True,
                   return_coords=False, just_pickup=True):
    # NOTE: just_pickup was referenced but never defined in the source; it is
    # exposed here as a parameter with an assumed default of True.
    all_files = [f for f in os.listdir(data_dir) if path.isfile(path.join(data_dir, f))]
    x = []
    y = []
    time = []
    has_passenger = []
    #combined_data_file = 'combined_data.pkl'
    combined_data_file = 'C:/PythonFramework/data_sets/taxi/combined_data.pkl'
    if path.exists(combined_data_file):
        print 'loading combined data...'
        all_data = helper_functions.load_object(combined_data_file)
        print 'done loading data'
    else:
        # Each file has columns: lat, lon, has_passenger, timestamp.
        for i, file in enumerate(all_files):
            if i == num_files_to_load:
                break
            if i >= 535:
                break
            file_data = load_csv(path.join(data_dir, file), has_field_names=False, delim=str(' '))[1]
            y.append(file_data[:, 0])
            x.append(file_data[:, 1])
            has_passenger.append(file_data[:, 2])
            time.append(file_data[:, 3])
            print i
        all_data = {'x': x, 'y': y, 'has_passenger': has_passenger, 'time': time}
        print 'saving combined data...'
        helper_functions.save_object(combined_data_file, all_data)
    x = all_data['x']
    y = all_data['y']
    has_passenger = all_data['has_passenger']
    time = all_data['time']
    x_all = np.concatenate(x)
    y_all = np.concatenate(y)
    time_all = np.concatenate(time)
    has_passenger_all = np.concatenate(has_passenger)
    pickup_inds = get_pickup_inds(x_all, y_all, time_all, has_passenger_all)
    if just_pickup:
        x_all = x_all[pickup_inds]
        y_all = y_all[pickup_inds]
        has_passenger_all = has_passenger_all[pickup_inds]
        time_all = time_all[pickup_inds]
    # Restrict to a bounding box around San Francisco, then quantize the
    # coordinates into a num_bins x num_bins grid.
    #x_bounds = [-122.45677419354838, -122.38322580645161]
    #y_bounds = [37.738054968287521, 37.816543340380548]
    x_bounds = [-122.48, -122.35]
    y_bounds = [37.7, 37.84]
    is_in_range = in_range(x_all, *x_bounds) & in_range(y_all, *y_bounds)
    x_all = x_all[is_in_range]
    y_all = y_all[is_in_range]
    x_all = quantize_loc(x_all, num_bins)
    y_all = quantize_loc(y_all, num_bins)
    time_all = time_all[is_in_range]
    get_hour_vec = np.vectorize(get_hour)
    hours = get_hour_vec(time_all)
    has_passenger_all = has_passenger_all[is_in_range]
    # Split the data into morning and evening subsets; the suffix labels the
    # variant used when saving results.
    suffix = '3'
    is_morning = (hours == 9)
    is_night = (hours == 18)
    if use_alternate:
        is_morning = (hours >= 5) & (hours <= 12)
        is_night = (hours >= 17)
        suffix = '2'
    suffix += '-' + str(num_bins)
    day_locs, day_values = count_cars(x_all[is_morning], y_all[is_morning], num_bins)
    night_locs, night_values = count_cars(x_all[is_night], y_all[is_night], num_bins)
    if return_coords:
        day_locs = bin_to_coordinates(day_locs, x_bounds, y_bounds, num_bins)
        night_locs = bin_to_coordinates(night_locs, x_bounds, y_bounds, num_bins)
    return day_locs, day_values, night_locs, night_values, suffix
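# Hypothetical usage sketch (argument values are illustrative): build the
# morning and evening taxi-density grids in coordinate form and inspect their
# sizes before plotting or saving.
day_locs, day_values, night_locs, night_values, suffix = load_taxi_data(
    num_files_to_load=10, num_bins=50, return_coords=True)
print day_locs.shape, night_locs.shape, suffix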
import datetime

file_name = 'MER_T12_06.csv'

def to_date(date_str):
    # Dates are YYYYMM strings; use the first of the month for the day.
    year = int(date_str[:4])
    month = int(date_str[4:])
    day = 1
    d = datetime.date(year, month, day)
    return d

feat_names, data = create_data_set.load_csv(
    file_name, True, dtype='str', delim=',',
    #num_rows=40000
    num_rows=100000000000)
y_names = ['Value']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'YYYYMM')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
to_keep = array_functions.true(date_strs.shape[0])
for i, date_str in enumerate(date_strs):
    # 'Month' 13 rows are annual totals; drop them along with missing values.
    if date_str[4:] == '13' or data[i, y_inds] == 'Not Available':
        to_keep[i] = False
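# Quick illustrative check of the YYYYMM parser above (the sample string is
# assumed, not from the source data):
print to_date('201306')  # -> 2013-06-01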
import numpy as np
import scipy
from data_sets import create_data_set
from data import data as data_lib
from utility import helper_functions

file = 'SAheart.data.txt'
all_field_names, data = create_data_set.load_csv(
    file, has_field_names=True, dtype='string', delim=str(','))
# Encode the Present/Absent feature as 1/0, drop the row-name column, and
# split the last column off as the label.
data[data == 'Present'] = '1'
data[data == 'Absent'] = '0'
data = data[:, 1:]
data = data.astype(np.float)
data = data_lib.Data(data[:, :-1], data[:, -1])
data.set_train()
data.set_target()
helper_functions.save_object('raw_data.pkl', data)
import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions

file_name = 'kc_house_data.csv'
feat_names, data = create_data_set.load_csv(file_name, True, dtype='str', delim=',')
feats_to_clear = ['id', 'date', 'yr_renovated', 'zipcode', 'lat', 'long']
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
y = data[:, y_ind].astype(np.float)
y /= 100000  # price in units of $100k
# find_set appears to return a boolean mask; keep every column except the
# label and the identifier/location features.
clear_idx = array_functions.find_set(feat_names, feats_to_clear + [y_name])
x = data[:, ~clear_idx]
x = array_functions.remove_quotes(x)
x = x.astype(np.float)
data = (x, y)
helper_functions.save_object('processed_data.pkl', data)
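# Hypothetical sanity check (not part of the original script): load the saved
# tuple back and confirm that features and labels line up row for row.
x_loaded, y_loaded = helper_functions.load_object('processed_data.pkl')
assert x_loaded.shape[0] == y_loaded.shape[0]
print x_loaded.shape, y_loaded.shape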