def split_data(file, configs):
    data = helper_functions.load_object(file)
    data.is_regression = configs.is_regression
    splitter = DataSplitter()
    splitData = data_lib.SplitData()
    splitData.data = data
    num_splits = 30
    perc_train = .8
    keep_for_splitting = None
    if configs.split_data_set_ids is not None:
        keep_for_splitting = array_functions.false(data.n)
        keep_for_splitting[data.data_set_ids == 0] = True
    # Pretend data_set_ids is a label vector to ensure each data set is split equally
    if data.is_regression and data.data_set_ids is not None:
        assert len(data.data_set_ids) == data.n
        is_regression = False
        splitData.splits = splitter.generate_splits(
            data.data_set_ids, num_splits, perc_train, is_regression, keep_for_splitting
        )
    else:
        splitData.splits = splitter.generate_splits(
            data.y, num_splits, perc_train, data.is_regression, keep_for_splitting
        )
    splitData.data_set_ids_to_keep = configs.data_set_ids_to_keep
    split_dir = os.path.dirname(file)
    save_file = split_dir + '/split_data.pkl'
    helper_functions.save_object(save_file, splitData)
    return splitData
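# Hedged usage sketch for split_data above: the configs argument only needs the
# three attributes the function reads. The class and path below are illustrative,
# not part of the framework's real config classes.
class _SplitConfigsSketch(object):
    is_regression = True
    split_data_set_ids = None
    data_set_ids_to_keep = None

#splitData = split_data('data_sets/boston_housing/raw_data.pkl', _SplitConfigsSketch())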
def test_mnist():
    num_per_class = 50
    data = helper_functions.load_object('../data_sets/mnist/raw_data.pkl')
    classes_to_use = [0, 4, 8, 7]
    I = array_functions.find_set(data.y, classes_to_use)
    data = data.get_subset(I)
    to_keep = None
    for i in classes_to_use:
        inds = (data.y == i).nonzero()[0]
        I = np.random.choice(inds, size=num_per_class, replace=False)
        if to_keep is None:
            to_keep = I
        else:
            to_keep = np.concatenate((to_keep, I))
    data.change_labels([classes_to_use[1], classes_to_use[3]], [classes_to_use[0], classes_to_use[2]])
    data.change_labels([classes_to_use[0], classes_to_use[2]], [0, 1])
    # to_keep holds integer indices, so build a boolean mask before negating;
    # ~to_keep on an int array is a bitwise complement, not a set complement
    keep_mask = array_functions.false(data.n)
    keep_mask[to_keep] = True
    data_test = data.get_subset(~keep_mask)
    data = data.get_subset(keep_mask)
    label_names = [
        str(classes_to_use[0]) + '+' + str(classes_to_use[1]),
        str(classes_to_use[2]) + '+' + str(classes_to_use[3]),
    ]
    #data = add_label_noise_cluster(data, num_neighbors=20)
    #data = add_label_noise(data, 20)
    test_methods(data.x, data.y, data_test.x, data_test.y, label_names, mnist=True)
def _load_temp_experiment_file(final_file_name, num_labels):
    experiment_temp_file = _temp_experiment_file_name(final_file_name, num_labels)
    if not os.path.isfile(experiment_temp_file):
        return None
    if mpi_utility.is_master():
        print 'found ' + experiment_temp_file + ' - loading'
    return helper_functions.load_object(experiment_temp_file)
def _load_temp_split_file(final_file_name, num_labels, split):
    split_temp_file = _temp_split_file_name(final_file_name, num_labels, split)
    if not os.path.isfile(split_temp_file):
        return None
    if mpi_utility.is_master():
        print 'found ' + split_temp_file + ' - loading'
    return helper_functions.load_object(split_temp_file)
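# _load_temp_experiment_file and _load_temp_split_file above share one checkpoint
# pattern: derive a temp path from the final results file name, return None when
# no checkpoint exists yet, and let only the MPI master process log the load.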
def create_kc_housing():
    file = 'kc_housing/processed_data.pkl'
    x, y = helper_functions.load_object(file)
    data = data_class.Data(x, y)
    data.is_regression = True
    s = kc_housing_file
    helper_functions.save_object(s, data)
def viz(pc, fig=None, show_histogram=False, show=True):
    import create_data_set
    from methods import method
    source_learner = method.NadarayaWatsonMethod()
    target_learner = method.NadarayaWatsonMethod()
    #pc = configs_lib.ProjectConfigs()
    data = helper_functions.load_object('../' + pc.data_file).data
    data.set_train()
    source_data = data.get_transfer_subset(pc.source_labels)
    source_data.set_target()
    target_data = data.get_transfer_subset(pc.target_labels)
    target_data.set_target()
    source_learner.train_and_test(source_data)
    target_learner.train_and_test(target_data)
    source_learner.sigma = 10
    target_learner.sigma = 10
    x = array_functions.vec_to_2d(np.linspace(data.x.min(), data.x.max(), 100))
    test_data = data_lib.Data()
    test_data.x = x
    test_data.is_regression = True
    y_s = source_learner.predict(test_data).fu
    y_t = target_learner.predict(test_data).fu
    #array_functions.plot_line(x, y_t - y_s, pc.data_set, y_axes=np.asarray([-5, 5]))
    y = y_t - y_s
    #y = y - y.mean()
    array_functions.plot_line(x, y, title=None, fig=fig, show=show)
    if show_histogram:
        array_functions.plot_histogram(data.x, 20)
    x = 1
def create_pollution(labels_to_use=np.arange(2), series_to_use=0, num_instances=None,
                     normalize_xy=True, save_data=True):
    file = 'pollution/processed_data.pkl'
    y, ids = helper_functions.load_object(file)
    y_to_use = y[:, series_to_use, :]
    print str(series_to_use) + ': ' + ids[series_to_use]
    data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[series_to_use]]))
    data.is_regression = True
    data.keep_series(labels_to_use)
    data = data.get_min_range()
    data.smooth_missing()
    data.x = data.x.astype(np.float)
    if num_instances is not None:
        data = data.get_range([0, num_instances])
    if normalize_xy:
        data.reset_x()
        data.normalize_y()
    data = data.create_data_instance()
    #perc_used = data.get_perc_used()
    if num_instances is not None:
        s = 'pollution-%d-%d' % (series_to_use, num_instances)
    else:
        s = 'pollution-%d' % series_to_use
    if normalize_xy:
        s += '-norm'
    s += '/raw_data.pkl'
    #array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
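# Hedged usage sketch for create_pollution above: one series index with a
# truncated instance range. The argument values are illustrative only.
#create_pollution(labels_to_use=np.arange(2), series_to_use=0, num_instances=500)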
def run_main():
    from numpy.linalg import norm
    from scipy.stats import pearsonr
    #data_dir = 'data_sets/concrete'
    data_dir = 'data_sets/boston_housing'
    #data_dir = 'data_sets/kc_housing'
    #data_dir = 'data_sets/synthetic_linear_reg500-50-1.01'
    #data_dir = 'data_sets/drosophilia'
    data_dir = 'data_sets/synthetic_linear_reg500-10-1.01'
    data_file = data_dir + '/split_data.pkl'
    data = helper_functions.load_object(data_file).data
    data.set_target()
    data.set_train()
    data.set_true_y()
    #data.x = array_functions.select_k_features(data.x, data.y, 50)
    # estimator is assumed to be defined at module scope
    estimator.train_and_test(data)
    w_normalized = estimator.w / norm(estimator.w)
    w_normalized = np.expand_dims(w_normalized, 1)
    print w_normalized
    p = estimator.w.size
    corr = np.zeros((p, 1))
    for i in range(p):
        xi = data.x[:, i]
        y = data.true_y
        corr[i] = pearsonr(xi, y)[0]
    print corr
    # compare normalized learned weights with per-feature Pearson correlations
    m = np.concatenate((w_normalized, corr), 1)
    print m
def load_data_and_splits(self, data_file):
    data_and_splits = helper_functions.load_object(data_file)
    data_and_splits.data.repair_data()
    assert self.configs.num_splits <= len(data_and_splits.splits)
    data_and_splits.labels_to_keep = self.configs.labels_to_keep
    data_and_splits.labels_to_not_sample = self.configs.labels_to_not_sample
    data_and_splits.target_labels = self.configs.target_labels
    data_and_splits.data.repair_data()
    return data_and_splits
def create_time_series(label_to_use=0, series_to_use=(0,), num_instances=None, normalize_x=False,
                       save_data=True, name='CO2_emissions'):
    # series_to_use must be iterable; the original scalar default of 0 would crash the loop below
    file = name + '/processed_data.pkl'
    all_data = []
    for i in series_to_use:
        y, ids = helper_functions.load_object(file)
        y_to_use = y[:, i, :]
        print str(i) + ': ' + ids[i]
        data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[i]]))
        data.is_regression = True
        data.keep_series(label_to_use)
        data = data.get_min_range()
        data.smooth_missing()
        data = data.get_nth(7)
        data.reset_x()
        data.x = data.x.astype(np.float)
        if num_instances is not None:
            data = data.get_range([0, num_instances])
            data = data.get_range([1000, 1500])
        if normalize_x:
            data.x -= data.x.min()
            data.x /= data.x.max()
        data = data.create_data_instance()
        try:
            if len(series_to_use) > 1:
                data.data_set_ids[:] = i
        except:
            pass
        all_data.append(data)
    #perc_used = data.get_perc_used()
    data = all_data[0]
    del all_data[0]
    for di in all_data:
        data.combine(di)
    if num_instances is not None:
        s = name + '-%s-%d' % (str(series_to_use), num_instances)
    else:
        s = name + '-%s' % str(series_to_use)
    if normalize_x:
        s += '-norm'
    s += '/raw_data.pkl'
    #array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
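# Hedged usage sketch for create_time_series: series_to_use must be a sequence
# even for a single series, e.g. a one-element tuple. Arguments are illustrative.
#create_time_series(label_to_use=0, series_to_use=(0, 1), name='CO2_emissions')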
def create_pollution(labels_to_use=np.arange(2), series_to_use=[0], num_instances=None,
                     normalize_xy=True, save_data=True):
    #series_to_use = 1
    file = 'pollution/processed_data.pkl'
    y, ids = helper_functions.load_object(file)
    data = None
    label_names = []
    for i in range(y.shape[1]):
        print str(i) + '-' + ids[i] + ': ' + str(y[0, i, :])
    for idx, s in enumerate(series_to_use):
        for label in labels_to_use:
            label_names.append(str(label) + '-' + ids[s])
        y_to_use = y[:, s, :]
        print str(s) + ': ' + ids[s]
        time_series_data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[s]]))
        time_series_data.is_regression = True
        time_series_data.keep_series(labels_to_use)
        time_series_data = time_series_data.get_min_range()
        time_series_data.smooth_missing()
        time_series_data.x = time_series_data.x.astype(np.float)
        if num_instances is not None:
            time_series_data = time_series_data.get_range([0, num_instances])
        if normalize_xy:
            time_series_data.reset_x()
            #time_series_data.normalize_y()
        curr_data = time_series_data.create_data_instance()
        curr_data.data_set_ids += idx * labels_to_use.size
        if data is None:
            data = curr_data
        else:
            data.combine(curr_data)
    data.label_names = label_names
    #perc_used = data.get_perc_used()
    if num_instances is not None:
        s = 'pollution-%s-%s' % (str(series_to_use), str(num_instances))
    else:
        # series_to_use is a list here, so the original %d format would raise a TypeError
        s = 'pollution-%s' % str(series_to_use)
    if normalize_xy:
        s += '-norm'
    s += '/raw_data.pkl'
    #array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
def run_main(results_file):
    results = helper_functions.load_object(results_file)
    x_list = []
    g_list = []
    y_axes = [-4, 4]
    for i, r in enumerate(results.results_list):
        x, g = aggregate_g(r)
        x_list.append(x)
        g_list.append(g)
    array_functions.plot_line_sub(x_list, g_list, title=results_file, y_axes=y_axes)
    x = 1
def run_experiments(self):
    data_file = self.configs.data_file
    data_and_splits = helper_functions.load_object(data_file)
    data_and_splits.data.repair_data()
    assert self.configs.num_splits <= len(data_and_splits.splits)
    data_and_splits.labels_to_keep = self.configs.labels_to_keep
    data_and_splits.labels_to_not_sample = self.configs.labels_to_not_sample
    data_and_splits.target_labels = self.configs.target_labels
    data_and_splits.data.repair_data()
    results_file = self.configs.results_file
    comm = mpi_utility.get_comm()
    if os.path.isfile(results_file):
        if mpi_utility.is_group_master():
            print results_file + ' already exists - skipping'
        return
    if mpi_utility.is_group_master():
        hostname = helper_functions.get_hostname()
        print '(' + hostname + ') Running experiments: ' + results_file
    learner = self.configs.learner
    learner.run_pre_experiment_setup(data_and_splits)
    num_labels = len(self.configs.num_labels)
    num_splits = self.configs.num_splits
    #method_results = results.MethodResults(n_exp=num_labels, n_splits=num_splits)
    method_results = self.configs.method_results_class(n_exp=num_labels, n_splits=num_splits)
    for i, nl in enumerate(self.configs.num_labels):
        method_results.results_list[i].num_labels = nl
    split_idx = self.configs.split_idx
    if split_idx is not None:
        num_labels_list = list(itertools.product(range(num_labels), [split_idx]))
    else:
        num_labels_list = list(itertools.product(range(num_labels), range(num_splits)))
    shared_args = (self, results_file, data_and_splits, method_results)
    args = [shared_args + (i_labels, split) for i_labels, split in num_labels_list]
    if self.configs.use_pool:
        pool = multiprocessing_utility.LoggingPool(processes=self.configs.pool_size)
        all_results = pool.map(_run_experiment, args)
    else:
        all_results = [_run_experiment(a) for a in args]
    for curr_results, s in zip(all_results, num_labels_list):
        if curr_results is None:
            continue
        i_labels, split = s
        method_results.set(curr_results, i_labels, split)
    method_results.configs = self.configs
    if self.configs.should_load_temp_data:
        helper_functions.save_object(results_file, method_results)
        for i_labels, split in num_labels_list:
            num_labels = self.configs.num_labels[i_labels]
            _delete_temp_split_files(results_file, num_labels, split)
        _delete_temp_folder(results_file)
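# For reference, each work item handed to _run_experiment above is the tuple
# (self, results_file, data_and_splits, method_results, i_labels, split), i.e.
# shared_args plus one (num_labels index, split index) pair from num_labels_list.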
def create_spatial_data(dir='climate-month'):
    file = dir + '/processed_data.pkl'
    locs, y, ids = helper_functions.load_object(file)
    y = y.T
    is_missing_loc = (~np.isfinite(locs)).any(1)
    locs = locs[~is_missing_loc, :]
    y = y[~is_missing_loc, :]
    ids = ids[~is_missing_loc]
    data = data_class.Data(locs, y)
    data.multilabel_to_multisource()
    s = dir + '/raw_data.pkl'
    helper_functions.save_object(s, data)
def get_sized_results(file_name):
    file_name_no_suffix = os.path.basename(helper_functions.remove_suffix(file_name, '.pkl'))
    dir_name = os.path.dirname(file_name)
    all_files = os.listdir(dir_name)
    sized_file_name = file_name_no_suffix + '-num_labels='
    files = []
    results = []
    for s in all_files:
        if sized_file_name in s:
            files.append(dir_name + '/' + s)
            results.append(helper_functions.load_object(dir_name + '/' + s))
    return results
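# Illustrative layout that get_sized_results scans; the file names below are
# hypothetical, only the '-num_labels=' convention comes from the code above:
#   results/exp.pkl                   base results file (passed as file_name)
#   results/exp-num_labels=10.pkl     picked up by the substring match
#   results/exp-num_labels=20.pkl     picked up by the substring match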
def create_spatial_data(dir="climate-month"): file = dir + "/processed_data.pkl" locs, y, ids = helper_functions.load_object(file) # y = y.T is_missing_loc = (~np.isfinite(locs)).any(1) locs = locs[~is_missing_loc, :] y = y[~is_missing_loc, :] ids = ids[~is_missing_loc] data = data_class.Data(locs, y) data.multilabel_to_multisource() s = dir + "/raw_data.pkl" helper_functions.save_object(s, data)
def subset_1_per_instance_id():
    data = helper_functions.load_object('data_sets/' + create_data_set.adience_aligned_cnn_file)
    to_keep = array_functions.false(data.n)
    all_ids = np.unique(data.instance_ids)
    # keep only the first occurrence of each instance id
    for id in all_ids:
        has_id = (data.instance_ids == id).nonzero()[0]
        to_keep[has_id[0]] = True
    to_keep = to_keep & data.is_labeled
    data = data.get_subset(to_keep)
    helper_functions.save_object('data_sets/' + create_data_set.adience_aligned_cnn_1_per_instance_id_file, data)
def vis_data():
    s = 'data_sets/' + data_file_dir + '/raw_data.pkl'
    data = helper_functions.load_object(s)
    x = data.x
    y = data.y
    for i in range(data.p):
        xi = x[:, i]
        title = 'Feature Names Missing'
        if data.feature_names is not None:
            title = data.feature_names[i]
        array_functions.plot_2d(xi, y, data_set_ids=data.data_set_ids, title=title)
def create_drosophila():
    data = helper_functions.load_object('drosophilia/processed_data.pkl')
    x, y = data
    y = np.reshape(y, y.shape[0])
    I = np.random.choice(x.shape[0], size=500, replace=False)
    x = x[I, :]
    y = y[I]
    data = data_class.Data()
    data.x = x
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    helper_functions.save_object(drosophila_file, data)
def run_main():
    import create_data_set
    from methods import method
    learner = method.NadarayaWatsonMethod()
    # Exactly one data set must be selected for s; the original left every
    # assignment commented out, so s was undefined. boston_housing is used here.
    #s = create_data_set.synthetic_step_transfer_file
    #s = create_data_set.synthetic_delta_linear_file
    #s = create_data_set.synthetic_step_linear_transfer_file
    #s = create_data_set.boston_housing_raw_data_file % '-13'
    #s = create_data_set.concrete_file % '-7'
    #s = create_data_set.concrete_file % '-feat=0'
    #s = create_data_set.bike_file % '-feat=1'
    #s = create_data_set.wine_file % '-small-11'
    s = create_data_set.boston_housing_raw_data_file % ''
    #learner = None
    data = helper_functions.load_object(s)
    viz_features(data.x, data.y, data.data_set_ids, learner=learner)
def create_drought(label_to_use=0, series_to_use=0, num_instances=None, normalize_x=False, save_data=True):
    file = 'drought/processed_data.pkl'
    y, ids = helper_functions.load_object(file)
    y_to_use = y[:, series_to_use, :]
    print str(series_to_use) + ': ' + ids[series_to_use]
    data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[series_to_use]]))
    data.is_regression = True
    data.keep_series(label_to_use)
    data = data.get_min_range()
    data.smooth_missing()
    data.x = data.x.astype(np.float)
    if num_instances is not None:
        data = data.get_range([0, num_instances])
    if normalize_x:
        data.x -= data.x.min()
        data.x /= data.x.max()
    data = data.create_data_instance()
    #perc_used = data.get_perc_used()
    if num_instances is not None:
        s = 'drought-%d-%d' % (series_to_use, num_instances)
    else:
        s = 'drought-%d' % series_to_use
    if normalize_x:
        s += '-norm'
    s += '/raw_data.pkl'
    #array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
import numpy as np
from utility import helper_functions

data = helper_functions.load_object('debug_data.pkl')
A = data['A']
S = data['S']
y = data['y']
v = S.dot(y)
try:
    np.linalg.lstsq(A, v)
    print 'it worked!'
except:
    print 'error caught'
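# A narrower variant of the probe above, reusing A and v: catching LinAlgError
# specifically avoids masking unrelated bugs, and rcond=-1 pins the legacy lstsq
# behavior this Python 2 era code assumes. A sketch, not part of the original script.
def try_lstsq(A, v):
    try:
        return np.linalg.lstsq(A, v, rcond=-1)
    except np.linalg.LinAlgError as e:
        print 'lstsq failed: ' + str(e)
        return None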
class data_statistics(object):
    # reconstructed from gen_stats below; the fragment began mid-__init__
    def __init__(self, mean, std, range):
        self.mean = mean
        self.std = std
        self.range = range

def gen_stats(data):
    mean = data.true_y.mean()
    std = data.true_y.std()
    range = [data.true_y.min(), data.true_y.max()]
    stats = data_statistics(mean, std, range)
    return stats

def plot_labels(data):
    n, bins, patches = plt.hist(data.true_y, 50, normed=1, facecolor='green', alpha=0.75)

if __name__ == '__main__':
    dirs = [
        synthetic_dir,
        boston_housing_dir,
        adience_dir,
        wine_dir
    ]
    titles = [
        'synthetic',
        'housing',
        'adience',
        'wine'
    ]
    num_rows = 2
    num_cols = math.ceil(len(dirs) / float(num_rows))
    stats = []
    for i, d in enumerate(dirs):
        # subplot indices are 1-based; the original passed i, which crashes for i == 0
        plt.subplot(2, 2, i + 1)
        data = helper_functions.load_object('../' + d + '/split_data.pkl').data
        stats.append(gen_stats(data))
        plot_labels(data)
        plt.title(titles[i])
    plt.show()
def create_table():
    vis_configs = configs_lib.VisualizationConfigs()
    viz_params = configs_lib.viz_params
    n = len(viz_params)
    if getattr(vis_configs, 'figsize', None):
        fig = plt.figure(figsize=vis_configs.figsize)
    else:
        fig = plt.figure()
    #fig.suptitle('Results')
    #num_rows = min(n, configs_lib.max_rows)
    cell_text = [[np.nan] * len(vis_configs.results_files) for i in range(len(viz_params))]
    cols = []
    rows = []
    size_to_vis = vis_configs.size_to_vis
    baseline_perf = []
    all_perf = []
    data_names = []
    for data_set_idx, curr_viz_params in enumerate(viz_params):
        vis_configs = configs_lib.VisualizationConfigs(**curr_viz_params)
        param_text = []
        if len(rows) <= data_set_idx:
            rows.append(vis_configs.results_dir)
        method_idx = 0
        mean_perf = []
        # Used for column names if not provided by users
        data_names.append(vis_configs.title)
        for file, legend_str in vis_configs.results_files:
            if len(cols) <= method_idx:
                cols.append(legend_str)
            if not os.path.isfile(file):
                print file + " doesn't exist - skipping"
                #assert False, "Creating Table doesn't work with missing files"
                cell_text[data_set_idx][method_idx] = 'Missing'
                mean_val = np.nan
            else:
                results = helper_functions.load_object(file)
                sized_results = get_sized_results(file)
                results = combine_results(results, sized_results)
                processed_results = results.compute_error_processed(vis_configs.loss_function, normalize_output=True)
                sizes = results.sizes
                #assert size_to_vis in sizes
                #size_idx = array_functions.find_first_element(sizes, size_to_vis)
                size_idx = 1
                #sizes = sizes[0:4]
                s = legend_str
                if s is None:
                    s = results.configs.learner.name_string
                highs = np.asarray(processed_results.means) + np.asarray(processed_results.highs)
                lows = np.asarray(processed_results.means) - np.asarray(processed_results.lows)
                mean_val = processed_results.means[size_idx]
                var = (highs - lows)[size_idx] / 2
                latex_str = '-'
                if mean_val < 1000:
                    #latex_str = '%.1f \\pm %.1f' % (mean_val, var)
                    latex_str = '%.3f (%.2f)' % (mean_val, var)
                    #latex_str = '%.2f(%.2f)' % (mean_val, var)
                cell_text[data_set_idx][method_idx] = latex_str
            if method_idx == vis_configs.baseline_idx:
                baseline_perf.append(mean_val)
            mean_perf.append(mean_val)
            method_idx += 1
        all_perf.append(np.asarray(mean_perf))
        #cell_text.append(param_text)
    relative_improvement = np.zeros((len(all_perf), all_perf[0].size))
    # Create table
    method_names_for_table = vis_configs.method_names_for_table
    latex_text = ''
    for method_name in method_names_for_table:
        latex_text += ' & ' + method_name
    latex_text += '\\\\ \\hline \n'
    #latex_text = ' & Ours: Linear & Target Only & LLGC & Reweighting & Offset & SMS & Stacking & Ours with Stacking\\\\ \\hline \n'
    # If data names are provided, use them instead of ones in config file
    if vis_configs.data_names_for_table is not None:
        data_names = vis_configs.data_names_for_table
    for row_idx, row_str in enumerate(cell_text):
        latex_text += data_names[row_idx] + ' & '
        for i, cell_str in enumerate(row_str):
            latex_text += ' $' + str(cell_str) + '$'
            if i != len(row_str) - 1:
                latex_text += ' &'
        latex_text += ' \\\\ \\hline\n'
    print latex_text
    # Skip the "baseline improvement" row if no baseline index is configured
    if vis_configs.baseline_idx is not None:
        for i in range(relative_improvement.shape[0]):
            relative_improvement[i, :] = (baseline_perf[i] - all_perf[i]) / baseline_perf[i]
        mean_relative_improvement = ''
        for ri in relative_improvement.T:
            v = ri[np.isfinite(ri)].mean() * 100
            mean_relative_improvement += ('$%.2f$ & ' % v)
        print 'relative improvement: ' + mean_relative_improvement
    fig, axs = plt.subplots()
    axs.axis('tight')
    axs.axis('off')
    the_table = axs.table(
        cellText=cell_text,
        rowLabels=rows,
        colLabels=cols,
        loc='center'
    )
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(10)
    plt.show()
    print ''
import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions
from datetime import date
from matplotlib import pyplot as pl
from data import data as data_lib

try:
    data = helper_functions.load_object('train.pkl')
except:
    file_name = 'train.csv'
    feat_names, data = create_data_set.load_csv(file_name, True, dtype=np.float, delim=',')
    data = data.astype(np.float)
    Y = data[:, 0]
    X = data[:, 1:]
    data = {'X': X, 'Y': Y}
    helper_functions.save_object('train.pkl', data)

x = data['X']
x /= 256
y = data['Y']
data = data_lib.Data(x, y)
helper_functions.save_object('raw_data.pkl', data)
def vis_data():
    s = '../data_sets/' + data_file_dir + '/raw_data.pkl'
    data = helper_functions.load_object(s)
    x = data.x
    y = data.y
    c = MethodConfigs()
    x_mat = vec_to_matrix(x, y)
    #self.set_data_set_defaults('taxi3', source_labels=[1], target_labels=[0], is_regression=True)
    c.source_labels = np.asarray([1])
    c.target_labels = np.asarray([0])
    c.use_validation = True
    I_target = (c.target_labels[0] == data.data_set_ids).nonzero()[0]
    I_to_use = np.random.choice(I_target, 40, replace=False)
    data.y[I_target] = np.nan
    data.y[I_to_use] = data.true_y[I_to_use]
    learner = local_transfer_methods.LocalTransferDeltaNew(c)
    v = 1
    learner.cv_params['sigma_target'] = learner.create_cv_params(-v, v)
    learner.cv_params['sigma_b'] = learner.create_cv_params(-v, v)
    learner.cv_params['sigma_alpha'] = learner.create_cv_params(-v, v)
    #learner.transform = None
    output = learner.train_and_test(data).prediction
    fig = plt.figure(0)
    plt.title('TODO')
    plt.axis('off')
    I_target = data.get_transfer_inds(c.target_labels)
    vals_to_plot = [
        np.abs(output.ft - output.true_y) ** 1,
        np.abs(output.y_s + output.b - output.true_y) ** 1,
        output.alpha,
        np.abs(output.y - output.true_y) ** 1,
    ]
    titles = [
        'Target Function \nError',
        'Adapted Source \nFunction Error',
        'Mixture Function \n',
        'Final Prediction \nError',
    ]
    min_error = min([vals_to_plot[i].min() for i in [0, 1, 3]])
    max_error = max([vals_to_plot[i].max() for i in [0, 1, 3]])
    print output.b
    print output.alpha
    for i, vals in enumerate(vals_to_plot):
        ax = plt.subplot(1, len(vals_to_plot), i + 1)
        ax.set_title(titles[i], fontsize=15)
        #array_functions.plot_heatmap(data.x[I_target], vals, fig=fig, make_subplot=False, sizes=20)
        #vals -= min_error
        #vals /= max_error
        vals -= vals.min()
        vals /= vals.max()
        vals_reshaped = np.reshape(vals, (40, 40))
        plt.pcolormesh(vals_reshaped, cmap=cm.gray, shading='flat', norm=None)
        ax.set_xlabel('Latitude')
        if i == 0:
            ax.set_ylabel('Longitude')
        else:
            ax.set_ylabel('')
        plt.xticks([], [])
        plt.yticks([], [])
    array_functions.move_fig(fig, 1200, 400)
    #plt.tight_layout(pad=0, h_pad=0, w_pad=0)
    plt.subplots_adjust(left=0.05, right=0.95, top=0.8, bottom=0.1)
    plt.show(block=True)
    print ''
import active_project_configs as configs_lib
from utility import helper_functions
import sklearn
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pc = configs_lib.ProjectConfigs()
main_configs = configs_lib.MainConfigs(pc)
data_file = '../' + main_configs.data_file
data_and_splits = helper_functions.load_object(data_file)
learner = main_configs.learner
skl = Ridge(normalize=True)
x = data_and_splits.data.x
y = data_and_splits.data.y
select_k_best = SelectKBest(f_regression, 50)
x = select_k_best.fit_transform(x, y)
skl.fit(x, y)
score = skl.score(x, y)
print 'R2 Score: ' + str(score)
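# Note: the R2 score above is computed on the same data used for both feature
# selection and fitting, so it is an optimistic training score rather than a
# validation estimate.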
import scipy.io as sio
import os
import numpy as np
from methods import method
from data import data as data_lib
from data_sets.create_data_split import DataSplitter
from copy import deepcopy
from loss_functions import loss_function
from utility import array_functions
from utility import helper_functions
from methods import transfer_methods
from configs import base_configs

data = helper_functions.load_object('raw_data.pkl')
data_splitter = DataSplitter()
data_splitter.data = data
splits = data_splitter.generate_splits(data.y)
split_data = data_lib.SplitData(data, splits)
use_transfer = True
use_regression = False
m = base_configs.MethodConfigs()
m.use_validation = True
if use_transfer:
    assert not use_regression
    m.loss_function = loss_function.ZeroOneError()
    m.cv_loss_function = loss_function.ZeroOneError()
    transfer_learner = transfer_methods.StackingTransfer(deepcopy(m))
    transfer_learner.base_learner = method.SKLLogisticRegression(deepcopy(m))
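# Hedged continuation sketch: how a split produced above would typically reach
# the stacking learner. get_split's two-index signature is assumed from its use
# elsewhere in this collection, not confirmed for SplitData.
#data_split = split_data.get_split(0, 0)
#results = transfer_learner.train_and_test(data_split)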
def load_taxi_data(num_files_to_load=np.inf, num_bins=50, use_alternate=True, return_coords=False):
    all_files = [
        f for f in os.listdir(data_dir) if path.isfile(path.join(data_dir, f))
    ]
    x = []
    y = []
    time = []
    has_passenger = []
    #combined_data_file = 'combined_data.pkl'
    combined_data_file = 'C:/PythonFramework/data_sets/taxi/combined_data.pkl'
    if path.exists(combined_data_file):
        print 'loading combined data...'
        all_data = helper_functions.load_object(combined_data_file)
        print 'done loading data'
    else:
        for i, file in enumerate(all_files):
            if i == num_files_to_load:
                break
            if i >= 535:
                break
            file_data = load_csv(path.join(data_dir, file), has_field_names=False, delim=str(' '))[1]
            y.append(file_data[:, 0])
            x.append(file_data[:, 1])
            has_passenger.append(file_data[:, 2])
            time.append(file_data[:, 3])
            print i
        all_data = {
            'x': x,
            'y': y,
            'has_passenger': has_passenger,
            'time': time
        }
        print 'saving combined data...'
        helper_functions.save_object(combined_data_file, all_data)
    x = all_data['x']
    y = all_data['y']
    has_passenger = all_data['has_passenger']
    time = all_data['time']
    x_all = np.concatenate(x)
    y_all = np.concatenate(y)
    time_all = np.concatenate(time)
    has_passenger_all = np.concatenate(has_passenger)
    pickup_inds = get_pickup_inds(x_all, y_all, time_all, has_passenger_all)
    # just_pickup is assumed to be defined at module scope; it is not a parameter here
    if just_pickup:
        x_all = x_all[pickup_inds]
        y_all = y_all[pickup_inds]
        has_passenger_all = has_passenger_all[pickup_inds]
        time_all = time_all[pickup_inds]
    #x_bounds = [-122.45677419354838, -122.38322580645161]
    #y_bounds = [37.738054968287521, 37.816543340380548]
    x_bounds = [-122.48, -122.35]
    y_bounds = [37.7, 37.84]
    #x_bounds = [-np.inf, np.inf]
    #y_bounds = x_bounds
    is_in_range = in_range(x_all, *x_bounds) & in_range(y_all, *y_bounds)
    x_all = x_all[is_in_range]
    y_all = y_all[is_in_range]
    x_all = quantize_loc(x_all, num_bins)
    y_all = quantize_loc(y_all, num_bins)
    time_all = time_all[is_in_range]
    hours = 9 * np.ones(time_all.shape)
    get_hour_vec = np.vectorize(get_hour)
    hours = get_hour_vec(time_all)
    '''
    get_day_vec = np.vectorize(get_day)
    days = get_day_vec(time_all)
    '''
    has_passenger_all = has_passenger_all[is_in_range]
    suffix = '3'
    is_morning = (hours == 9)
    is_night = (hours == 18)
    #is_morning = (hours == 6) & (days == 21)
    #is_night = (hours == 18) & (days == 21)
    #is_morning = (days == 21)
    #is_night = (days == 24)
    if use_alternate:
        is_morning = (hours >= 5) & (hours <= 12)
        is_night = (hours >= 17)
        #is_morning = days == 21
        #is_night = days == 24
        #is_morning = (has_passenger_all == 1) & (days == 21) & is_morning
        #is_night = (has_passenger_all == 1) & (days == 21) & is_night
        #is_morning = (has_passenger_all == 1) & (hours == 6)
        #is_night = (has_passenger_all == 1) & (hours == 18)
        suffix = '2'
    suffix += '-' + str(num_bins)
    #print np.unique(days)
    #is_morning = days == 4
    #is_night = days == 8
    day_locs, day_values = count_cars(x_all[is_morning], y_all[is_morning], num_bins)
    night_locs, night_values = count_cars(x_all[is_night], y_all[is_night], num_bins)
    if return_coords:
        day_locs = bin_to_coordinates(day_locs, x_bounds, y_bounds, num_bins)
        night_locs = bin_to_coordinates(night_locs, x_bounds, y_bounds, num_bins)
    '''
    if use_alternate:
        I = (day_values > 0) | (night_values > 0)
        I = I & (day_values > 0) & (night_values > 0)
    else:
        I = (day_values > 5) | (night_values > 5)
        I = I & (day_values > 0) & (night_values > 0)
    relative_diff = np.max(day_values[I] - night_values[I]) / day_values[I]
    '''
    #array_functions.plot_heatmap(day_locs[I], relative_diff, sizes=10, alpha=1, subtract_min=False)
    return day_locs, day_values, night_locs, night_values, suffix
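# Hedged usage sketch for load_taxi_data: return_coords=True converts bin indices
# back to map coordinates. data_dir and just_pickup must already be defined at
# module scope; the argument values below are illustrative.
#day_locs, day_values, night_locs, night_values, suffix = load_taxi_data(
#    num_files_to_load=100, num_bins=50, return_coords=True)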
def vis_data():
    s = 'data_sets/' + data_file_dir + '/raw_data.pkl'
    data = helper_functions.load_object(s)
    x = data.x
    y = data.y
    titles = ['', '']
    label_idx = [0, 1]
    if plot_climate:
        img_path = 'C:/PythonFramework/far_transfer/figures/climate-terrain.png'
        image = imread(img_path)
        label_idx = [0, 4]
    if data_file_dir == 'climate-month':
        titles = [
            'Max Temperature Gradient: January',
            'Max Temperature Gradient: April'
        ]
        label_idx = [0, 4]
    elif data_file_dir == 'irs-income':
        titles = ['Income', 'Household Size']
    elif data_file_dir == 'zillow-traffic':
        titles = ['Morning Taxi Pickups', 'Housing Prices']
    elif data_file_dir == 'kc-housing-spatial-floors':
        titles = ['House Prices: 1 Floor', 'House Prices: 2 or More Floors']
    if plot_features:
        for i in range(data.p):
            xi = x[:, i]
            title = 'Feature Names Missing'
            if data.feature_names is not None:
                title = data.feature_names[i]
            array_functions.plot_2d(xi, y, data_set_ids=data.data_set_ids, title=title)
    else:
        for i, title in zip(label_idx, titles):
            #plt.close()
            I = data.data_set_ids == i
            if plot_gradients or plot_values:
                g, v = estimate_gradients(x, y, I)
                if plot_values:
                    g = v
                    #g = np.log(g)
                    #g -= g.min()
                    #g += g.max() / 10.0
                    #g /= g.max()
                    # the original branched on data_file_dir and i here, but every
                    # branch applied the same min/max normalization
                    g -= g.min()
                    g /= g.max()
                    #g **= .5
                else:
                    if i == 0:
                        g -= g.min()
                        g /= g.max()
                        g = np.sqrt(g)
                    else:
                        g -= g.min()
                        g /= g.max()
                        g **= 1
                #array_functions.plot_heatmap(g, sizes=dot_sizes, fig=fig, title=title)
                fig = plt.figure(i)
                plt.title(title)
                plt.axis('off')
                plt.imshow(g)
                array_functions.move_fig(fig, 750, 400)
                #plt.show(block=False)
            else:
                fig = plt.figure(4)
                array_functions.plot_heatmap(x[I, :], y[I], sizes=dot_sizes, fig=fig, title=title)
                if plot_climate:
                    plt.imshow(image, zorder=0, extent=[-90, -78, 33.5, 38])
                array_functions.move_fig(fig, 1400, 600)
    plt.show(block=True)
def run_visualization():
    vis_configs = configs_lib.VisualizationConfigs()
    #data_sets = configs_lib.data_sets_for_exps
    #n = len(data_sets)
    viz_params = configs_lib.viz_params
    n = len(viz_params)
    if getattr(vis_configs, 'figsize', None):
        fig = plt.figure(figsize=vis_configs.figsize)
    else:
        fig = plt.figure()
    #fig.suptitle('Results')
    #num_rows = min(n, configs_lib.max_rows)
    num_rows = min(n, vis_configs.max_rows)
    num_cols = math.ceil(float(n) / num_rows)
    markers = [
        's', '*', '>', '^', 'v', 'X', 'P', 'd', '*'
    ]
    for config_idx, curr_viz_params in enumerate(viz_params):
        subplot_idx = config_idx + 1
        plt.subplot(num_rows, num_cols, subplot_idx)
        axis = [0, 1, np.inf, -np.inf]
        vis_configs = configs_lib.VisualizationConfigs(**curr_viz_params)
        sizes = None
        min_x = np.inf
        max_x = -np.inf
        marker_idx = -1
        is_file_missing = False
        for file, legend_str in vis_configs.results_files:
            if not os.path.isfile(file):
                is_file_missing = True
                print file + " doesn't exist - skipping"
                assert len(viz_params) == 1 or \
                    vis_configs.show_legend_on_all or \
                    vis_configs.show_legend_on_missing_files or \
                    not vis_configs.crash_on_missing_files, \
                    'Just to be safe, crashing because files are missing'
                continue
            marker_idx += 1
            results = helper_functions.load_object(file)
            sized_results = get_sized_results(file)
            sizes_to_plot = vis_configs.sizes_to_use
            if sizes_to_plot is not None:
                sizes_to_plot = set(sizes_to_plot)
            results = combine_results(results, sized_results)
            to_remove = list()
            for j, s in enumerate(results.sizes):
                if sizes_to_plot is not None and s not in sizes_to_plot:
                    to_remove.append(j)
            for j in reversed(to_remove):
                del results.results_list[j]
            #results.results_list = results.results_list[~to_remove]
            if len(results.sizes) == 0:
                print file + ' has no results for sizes ' + str(sizes_to_plot) + ', skipping'
            #plt.plot([1,2,3], [1,2,3], 'go-', label='line 1', linewidth=2)
            processed_results = results.compute_error_processed(
                vis_configs.loss_function,
                vis_configs.results_features,
                vis_configs.instance_subset,
                normalize_output=True
            )
            sizes = results.sizes
            #sizes = sizes[0:4]
            min_x = min(min_x, sizes.min())
            max_x = max(max_x, sizes.max())
            s = legend_str
            if s is None:
                s = results.configs.learner.name_string
            print 'Plotting: ' + file
            print 'Mean Errors: ' + str(processed_results.means)
            plt.errorbar(sizes, processed_results.means,
                         yerr=[processed_results.lows, processed_results.highs],
                         label=s,
                         marker=markers[marker_idx],
                         markersize=8)
            highs = np.asarray(processed_results.means) + np.asarray(processed_results.highs)
            lows = np.asarray(processed_results.means) - np.asarray(processed_results.lows)
            axis[3] = max(axis[3], highs.max() + .2 * lows.min())
            axis[2] = min(axis[2], .9 * lows.min())
        if sizes is None:
            print 'Empty plot - skipping'
            continue
        plt.title(vis_configs.title, fontsize=vis_configs.fontsize)
        axis_range = max_x - min_x
        axis[1] = max_x + .1 * axis_range
        axis[0] = min_x - .1 * axis_range
        #show_x_label = num_rows == 1 or subplot_idx > (num_rows - 1) * num_cols
        #show_x_label = num_rows == 1 or subplot_idx == 8
        #show_x_label = subplot_idx == 9
        #show_x_label = subplot_idx == 8
        show_x_label = True
        show_y_label = num_cols == 1 or subplot_idx % num_cols == 1 or vis_configs.always_show_y_label
        if show_x_label:
            plt.xlabel(vis_configs.x_axis_string)
        if show_y_label:
            plt.ylabel(vis_configs.y_axis_string)
        #axis[1] *= 2
        axis[3] *= 1
        ylims = getattr(vis_configs, 'ylims', None)
        if ylims is not None:
            axis[2] = ylims[0]
            axis[3] = ylims[1]
        plt.axis(axis)
        if config_idx == 2 or vis_configs.show_legend_on_all or len(viz_params) == 1 \
                or (vis_configs.show_legend_on_missing_files and is_file_missing):
            plt.legend(loc='upper right', fontsize=vis_configs.fontsize)
    #fig.tight_layout(rect=[.05, .05, .95, .95])
    if getattr(vis_configs, 'borders', None):
        left, right, top, bottom = vis_configs.borders
        fig.subplots_adjust(left=left, right=right, top=top, bottom=bottom)
    if vis_configs.use_tight_layout:
        plt.tight_layout()
    plt.show()
    x = 1
def vis_data():
    pc = configs_lib.ProjectConfigs(bc.DATA_KC_HOUSING)
    #pc = configs_lib.ProjectConfigs(bc.DATA_CLIMATE_MONTH)
    pc.active_method = configs_lib.ACTIVE_CLUSTER_PURITY
    #pc.active_method = configs_lib.ACTIVE_CLUSTER
    #pc.active_method = configs_lib.ACTIVE_RANDOM
    pc.fixed_sigma_x = False
    pc.no_spectral_kernel = False
    pc.no_f_x = False
    pc.active_items_per_iteration = 10
    use_oracle_target = False
    main_configs = configs_lib.MainConfigs(pc)
    data_file = '../' + main_configs.data_file
    data_and_splits = helper_functions.load_object(data_file)
    data = data_and_splits.get_split(0, 0)
    is_target = data.data_set_ids == main_configs.target_labels[0]
    is_source = data.data_set_ids == main_configs.source_labels[0]
    data.reveal_labels(is_source.nonzero()[0])
    data.type = data_lib.TYPE_TARGET * np.ones(data.n)
    data.type[is_source] = data_lib.TYPE_SOURCE
    x = data.x
    y = data.y
    learner = main_configs.learner
    learner.use_oracle_target = use_oracle_target
    if pc.active_method == configs_lib.ACTIVE_CLUSTER_PURITY and False:
        learner.instance_selector.cv_params['sigma_y'] = [1]
    print 'Experiment: ' + learner.prefix
    results = learner.train_and_test(data)
    queried_data = results.results_list[0].queried_idx
    selected_data = data.get_subset(queried_data)
    fig = plt.figure(0, figsize=(12, 5))
    plt.title('TODO')
    plt.axis('off')
    x1 = data.x[:, 0]
    x1_sel = selected_data.x[:, 0]
    if data.p == 1:
        x2 = data.true_y
        x2_sel = selected_data.true_y
    else:
        assert data.p == 2
        x2 = data.x[:, 1]
        x2_sel = selected_data.x[:, 1]
    plt.subplot(1, 3, 1)
    plt.scatter(x1[is_target], x2[is_target], c='b', s=10)
    plt.scatter(x1_sel, x2_sel, c='r', s=20)
    if data.p == 2:
        plt.subplot(1, 3, 2)
        target_data = data.get_subset(is_target)
        target_data.y = target_data.true_y.copy()
        nw_method = method.NadarayaWatsonMethod()
        y_pred = nw_method.train_and_test(target_data).prediction.y
        means, _, _, _ = binned_statistic_2d(target_data.x[:, 0], target_data.x[:, 1], y_pred, bins=30)
        #means = means[:, ::-1]
        #means = means[::-1, :]
        means[~np.isfinite(means)] = -1
        plt.pcolormesh(means, cmap='RdBu')
        plt.colorbar()
        plt.subplot(1, 3, 3)
        source_data = data.get_subset(is_source)
        source_data.y = source_data.true_y.copy()
        nw_method = method.NadarayaWatsonMethod()
        y_pred = nw_method.train_and_test(source_data).prediction.y
        means, _, _, _ = binned_statistic_2d(source_data.x[:, 0], source_data.x[:, 1], y_pred, bins=30)
        #means = means[:, ::-1]
        #means = means[::-1, :]
        means[~np.isfinite(means)] = -1
        plt.pcolormesh(means, cmap='RdBu')
        plt.colorbar()
    plt.show()