def create_diabetes():
    diabetes_data = datasets.load_diabetes()
    x = diabetes_data.data
    y = diabetes_data.target
    yi = array_functions.normalize(y)
    # Plot each normalized feature against the normalized target.
    for i in range(x.shape[1]):
        xi = array_functions.normalize(x[:, i])
        array_functions.plot_2d(xi, yi)
    # Intentional stop: this function only visualizes and never saves data.
    assert False
def create_digits():
    digits_data = datasets.load_digits()
    x = digits_data.data
    y = digits_data.target
    for i in range(x.shape[1]):
        xi = array_functions.normalize(x[:, i])
        yi = y
        array_functions.plot_2d(xi, yi, alpha=.01)
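# Note: array_functions.normalize (used throughout this file) appears to
# min-max scale a vector to [0, 1]. A minimal standalone sketch under that
# assumption -- the real helper may handle edge cases (e.g. constant
# vectors) differently:
#
# import numpy as np
#
# def normalize_sketch(v):
#     # Min-max scale a vector to the [0, 1] interval.
#     v = np.asarray(v, dtype=float)
#     return (v - v.min()) / (v.max() - v.min())
#
# print(normalize_sketch(np.array([2.0, 4.0, 6.0])))  # [0.  0.5 1. ]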
def create_bike_sharing():
    file = 'bike_sharing/day.csv'
    columns = [0] + range(2, 16)
    # Read just the header row to recover the field names.
    all_field_names = pd.read_csv(file, nrows=1, dtype='string')
    all_field_names = np.asarray(all_field_names.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(file, skiprows=1, delimiter=',', usecols=columns)
    # The 'yr' column (0 or 1) splits the data into two domains.
    domain_ind = used_field_names == 'yr'
    domain_ids = np.squeeze(bike_data[:, domain_ind])
    #inds_to_keep = (used_field_names == 'temp') | (used_field_names == 'atemp')
    #bike_data = bike_data[:, inds_to_keep]
    #used_field_names = used_field_names[inds_to_keep]
    viz = True
    # Keep the weather features: temp, atemp, hum, windspeed.
    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        #learner = make_learner()
        learner = None
        viz_features(x, y, domain_ids, used_field_names, learner=learner)
    field_to_use = 1
    x = x[:, field_to_use]
    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids
    s = bike_file % ('-feat=' + str(field_to_use))
    helper_functions.save_object(s, data)
True, dtype='str', delim=',')
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
y = data[:, y_ind].astype(np.float)
y /= 100000
suffix = ''
if create_geospatial_data:
    x_feats = ['long', 'lat']
    x_feat_inds = array_functions.find_set(feat_names, x_feats)
    x = data[:, x_feat_inds]
    x = array_functions.remove_quotes(x)
    x = x.astype(np.float)
    x[:, 0] = array_functions.normalize(x[:, 0])
    x[:, 1] = array_functions.normalize(x[:, 1])
    # Drop spatial outliers: keep points within the 1st-99th percentile range
    # on both coordinates.
    I = array_functions.is_in_percentile(x[:, 0], .01, .99)
    I &= array_functions.is_in_percentile(x[:, 1], .01, .99)
    x = x[I, :]
    y = y[I]
    data = data[I, :]
if split_date:
    dates = array_functions.remove_quotes(data[:, feat_names == 'date'])
    date_objs = []
    for d in dates:
        date_obj = get_date(d)
        date_objs.append(date_obj)
    min_date = min(date_objs)
    day_deltas = np.zeros(len(date_objs))
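# Note: the is_in_percentile filter above trims spatial outliers. A minimal
# numpy sketch under the assumption that the helper keeps values between the
# given quantiles (the actual array_functions implementation may differ):
#
# import numpy as np
#
# def is_in_percentile_sketch(v, low, high):
#     # Boolean mask of entries between the low and high quantiles of v.
#     lo, hi = np.percentile(v, [100 * low, 100 * high])
#     return (v >= lo) & (v <= hi)
#
# v = np.random.randn(1000)
# print(is_in_percentile_sketch(v, .01, .99).mean())  # roughly 0.98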
def predict(self, data):
    # d = data_lib.Data(np.expand_dims(data.source_y_pred, 1), data.y)
    y_pred_source = data.source_y_pred
    I = np.arange(y_pred_source.size)
    # Optionally subsample the prediction graph to keep it tractable.
    if self.predict_sample is not None and self.predict_sample < y_pred_source.size:
        I = np.random.choice(y_pred_source.size, self.predict_sample, replace=False)
    if self.use_rbf:
        #L = array_functions.make_laplacian(y_pred_source[I], self.sigma_tr)
        W_source_pred = array_functions.make_rbf(y_pred_source[I], self.sigma_tr)
        if self.oracle_guidance is not None:
            # Reveal a random subset of pairwise similarities using true labels.
            y = data.true_y[I]
            n_y = y.size
            num_to_sample = math.ceil(self.oracle_guidance * n_y ** 2)
            rand_index1 = np.random.choice(n_y, int(num_to_sample), replace=True)
            rand_index2 = np.random.choice(n_y, int(num_to_sample), replace=True)
            if self.oracle_guidance_binary:
                target_distances = array_functions.make_graph_distance(y)
                distance_threshold = .2 * (y.max() - y.min())
                W_source_pred[rand_index1, rand_index2] = \
                    target_distances[rand_index1, rand_index2] <= distance_threshold
                W_source_pred[rand_index2, rand_index1] = \
                    target_distances[rand_index2, rand_index1] <= distance_threshold
            else:
                y_scaled = array_functions.normalize(y) * \
                    (y_pred_source.max() - y_pred_source.min())
                W_oracle_pred = array_functions.make_rbf(y_scaled, self.sigma_tr)
                W_source_pred[rand_index1, rand_index2] = \
                    W_oracle_pred[rand_index1, rand_index2]
                W_source_pred[rand_index2, rand_index1] = \
                    W_oracle_pred[rand_index2, rand_index1]
        W = array_functions.make_rbf(
            self.transform.transform(self.x), self.sigma_nw,
            x2=self.transform.transform(data.x[I, :])).T
    else:
        assert self.oracle_guidance is None
        k_L = int(self.sigma_tr * I.size)
        #L = array_functions.make_laplacian_kNN(y_pred_source[I], k_L)
        W_source_pred = array_functions.make_knn(y_pred_source[I], k_L)
        k_W = int(self.sigma_nw * self.x.shape[0])
        W = array_functions.make_knn(
            self.transform.transform(data.x[I, :]), k_W,
            x2=self.transform.transform(self.x))
    sparsify_prediction_graph = False
    if self.use_prediction_graph_radius:
        sparsify_prediction_graph = True
        W_sparse = array_functions.make_graph_radius(
            self.transform.transform(data.x[I, :]), radius=self.radius)
    if self.use_prediction_graph_sparsification:
        sparsify_prediction_graph = True
        W_sparse = array_functions.make_knn(
            self.transform.transform(data.x[I, :]), self.k_sparsification,
            normalize_entries=False)
    #W_L = array_functions.make_knn(y_pred_source[I], k_L)
    if sparsify_prediction_graph:
        W_source_pred = W_source_pred * W_sparse
    S = array_functions.make_smoothing_matrix(W)
    timing_test = False
    C = self.C * self.x.shape[0] / W_source_pred[:].sum()
    if self.nystrom_percentage > 0 or timing_test:
        if timing_test:
            tic()
        Sy = S.dot(self.y)
        if C != 0:
            lamb = 1 / float(C)
            f = None
            tic()
            # Approximate (I + C*L)^-1 via a Nystrom/Woodbury low-rank inverse.
            inv_approx, _ = array_functions.nystrom_woodbury_laplacian(
                W_source_pred, lamb, self.nystrom_percentage)
            self.predict_time = toc()
            #_, f2 = array_functions.nystrom_woodbury_laplacian(W_source_pred, lamb, self.nystrom_percentage, v=Sy)
            if f is not None:
                f *= lamb
            else:
                inv_approx *= lamb
                f = inv_approx.dot(Sy)
        else:
            f = Sy
        if timing_test:
            toc()
    if self.nystrom_percentage == 0 or self.nystrom_percentage is None or timing_test:
        if timing_test:
            tic()
        # Exact path: solve (I + C*L) f = S y directly.
        L = array_functions.make_laplacian_with_W(W_source_pred, normalized=False)
        A = np.eye(I.size) + C * L
        try:
            tic()
            f = np.linalg.lstsq(A, S.dot(self.y))[0]
            self.predict_time = toc()
        except Exception:
            print 'GraphTransferNW:predict failed, returning mean'
            f = self.y.mean() * np.ones(data.true_y.shape)
        if timing_test:
            toc()
    if timing_test:
        A_inv = np.linalg.inv(A)
        print 'approx error: ' + str(norm(inv_approx - A_inv) / norm(A_inv))
    o = results.Output(data)
    if self.predict_sample is not None:
        # Extend predictions from the sampled nodes to all of data.x
        # with the Nadaraya-Watson learner.
        nw_data = data_lib.Data(data.x[I, :], f)
        self.nw_learner.train_and_test(nw_data)
        nw_output = self.nw_learner.predict(data)
        o.y = nw_output.y
        o.fu = nw_output.y
    else:
        o.y = f
        o.fu = f
    return o
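# Note: the fast path in predict relies on nystrom_woodbury_laplacian to
# approximate the inverse of (lamb*I + L) without a dense n x n solve. A
# minimal sketch of the Woodbury identity it presumably exploits, with the
# Laplacian replaced by a generic low-rank factor L ~= U U^T (the real helper
# would build its low-rank factor from a Nystrom column sample):
#
# import numpy as np
#
# n, k, lamb = 200, 10, 0.5
# U = np.random.randn(n, k)           # low-rank factor, L ~= U.dot(U.T)
# A = lamb * np.eye(n) + U.dot(U.T)   # the matrix (lamb*I + L) to invert
#
# # Woodbury: (lamb*I + U U^T)^-1 = I/lamb - U (lamb*I_k + U^T U)^-1 U^T / lamb
# small = np.linalg.inv(lamb * np.eye(k) + U.T.dot(U))  # only k x k
# A_inv = np.eye(n) / lamb - U.dot(small).dot(U.T) / lamb
#
# print(np.linalg.norm(A_inv - np.linalg.inv(A)))  # ~1e-12
#
# In predict, lamb = 1/C, so the 'inv_approx *= lamb' step rescales
# (lamb*I + L)^-1 into the desired (I + C*L)^-1.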
def to_date(date_str):
    # Parse a 'month/day/year hh:mm:ss' timestamp into a datetime.date.
    a = date_str.split(' ')[0]
    a = a.split('/')
    month, day, year = [int(s) for s in a]
    d = datetime.date(year, month, day)
    return d

locs, y, ids = create_data_set.load_trip_data(
    [file_name_apr, file_name_sep], None, 'Date/Time',
    np.asarray(['Lon', 'Lat']), [100, 100], plot_data=True)
# Scale each target column to [0, 1] and normalize the coordinates.
y[:, 0] /= y[:, 0].max()
y[:, 1] /= y[:, 1].max()
locs[:, 0] = array_functions.normalize(locs[:, 0])
locs[:, 1] = array_functions.normalize(locs[:, 1])
# Keep only locations with at least one nonzero target.
I = (y.sum(1) > 0)
locs = locs[I, :]
y = y[I, :]
ids = ids[I]
data = (locs, y, ids)
helper_functions.save_object('processed_data.pkl', data)
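# Example: to_date on a (hypothetical) timestamp in the 'Date/Time' format
# these trip files appear to use:
#   to_date('4/18/2014 21:38:00')  ->  datetime.date(2014, 4, 18)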
# Keep only zipcodes present in all three tables (income, locations, housing).
zipcodes.intersection_update(zipcode_locs.keys())
zipcodes.intersection_update(zipcode_housing.keys())
zipcode_array = np.zeros(len(zipcodes))
income_array = np.zeros(len(zipcodes))
locs = np.zeros((len(zipcodes), 2))
households = np.zeros(len(zipcodes))
for i, key in enumerate(zipcodes):
    zipcode_array[i] = key
    income_array[i] = zipcode_income[key]
    locs[i] = zipcode_locs[key]
    households[i] = zipcode_housing[key]
# Log-transform income, then rescale everything to [0, 1].
income_array = np.log(income_array)
income_array = array_functions.normalize(income_array)
households = array_functions.normalize(households)
locs[:, 0] = array_functions.normalize(locs[:, 0])
locs[:, 1] = array_functions.normalize(locs[:, 1])
#array_functions.plot_heatmap(locs, 10*income_array, sizes=50)
#array_functions.plot_heatmap(locs, households, sizes=50)
y = np.stack((income_array, households), 1)
print 'Num Used: ' + str(y.shape[0])
array_functions.plot_heatmap(locs, y, sizes=100, share_axis=True)
# Subsample 400 zipcodes for the saved data set.
I = np.random.choice(y.shape[0], 400, replace=False)
data = (locs[I, :], y[I], zipcode_array[I])
helper_functions.save_object('processed_data.pkl', data)
target_errors[split_idx] = loss.compute_score(target_results)
source_errors[split_idx] = loss.compute_score(source_results)
stacking_errors[split_idx] = loss.compute_score(stacking_results)
errors[state_idx, 0] = target_errors.mean()
errors[state_idx, 1] = source_errors.mean()
errors[state_idx, 2] = stacking_errors.mean()
# Relative error of each transfer method against the target-only baseline.
# Note: despite its name, source_relative compares stacking to target.
target_relative = (source_errors.mean() - target_errors.mean()) / target_errors.mean()
source_relative = (stacking_errors.mean() - target_errors.mean()) / stacking_errors.mean()
print 'Target ' + s + ': ' + str(target_relative)
print 'Stacking ' + s + ': ' + str(source_relative)
if source_relative > target_relative:
    print '!!!'
    exit()
if not viz:
    for i in range(2):
        data.x[:, i] = array_functions.normalize(data.x[:, i])
print 'n: ' + str(data.n)
if viz:
    I1 = data.data_set_ids == 0
    I2 = data.data_set_ids == 1
    fig1 = pl.figure(1)
    array_functions.plot_heatmap(data.x[I1, :], data.y[I1], sizes=30, alpha=1,
                                 subtract_min=True, fig=fig1)
    pl.xlabel('Longitude')
    pl.ylabel('Latitude')
    pl.title('Taxi Pickups')
    #pl.xticks([], [])
    #pl.yticks([], [])
    fig2 = pl.figure(2)
    array_functions.plot_heatmap(data.x[I2, :], data.y[I2], sizes=30, alpha=1,
                                 subtract_min=True, fig=fig2)
    pl.xlabel('Longitude')
    pl.ylabel('Latitude')