def get_predictions(self, target_data):
    '''
    Legacy multi-output version, kept for reference:

    o = self.target_learner.predict_loo(target_data)
    o_source = self.source_learner.predict(target_data)
    is_labeled = target_data.is_labeled
    target_labels = self.configs.target_labels
    if self.use_estimated_f:
        o = self.target_learner.predict_loo(target_data.get_subset(is_labeled))
    if target_data.is_regression:
        y_t = array_functions.vec_to_2d(o.fu)
        y_s = array_functions.vec_to_2d(o_source.fu[is_labeled])
        y_true = array_functions.vec_to_2d(o.true_y)
    else:
        y_t = o.fu[:, target_labels]
        y_s = o_source.fu[:, target_labels]
        y_s = y_s[is_labeled, :]
        y_true = array_functions.make_label_matrix(o.true_y)[:, target_labels]
        y_true = array_functions.try_toarray(y_true)
    return (y_t, y_s, y_true)
    '''
    assert target_data.is_regression
    o = self.source_learner.predict(target_data)
    is_labeled = target_data.is_labeled
    # Return source predictions and ground truth on the labeled instances,
    # both as column vectors.
    y_s = array_functions.vec_to_2d(o.fu[is_labeled])
    y_true = array_functions.vec_to_2d(o.true_y[is_labeled])
    return (y_s, y_true)
def combine_predictions(self, x, y_source, y_target):
    data = data_lib.Data()
    data.x = x
    data.is_regression = True
    # g controls how much weight the source predictions get at each point;
    # the target weight a_t and source weight b_s form a convex combination,
    # a_t + b_s = 1.
    g = self.g_nw.predict(data).fu
    a_t = 1 / (1 + g)
    b_s = g / (1 + g)
    if y_source.ndim > 1:
        a_t = array_functions.vec_to_2d(a_t)
        b_s = array_functions.vec_to_2d(b_s)
        fu = a_t * y_target + b_s * (y_source + self.bias)
    else:
        fu = np.multiply(a_t, y_target) + np.multiply(b_s, y_source + self.bias)
    return fu
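# A minimal, self-contained sketch of the convex-combination idea used by
# combine_predictions above, assuming only numpy. The names here (blend, g,
# bias) are illustrative stand-ins, not the repo's API: g >= 0 is a learned
# per-instance mixing value, and the weights a_t = 1/(1+g), b_s = g/(1+g)
# always sum to 1.
import numpy as np

def blend(y_target, y_source, g, bias=0.0):
    # Convex weights: g = 0 trusts the target model fully; large g trusts
    # the (bias-corrected) source model.
    a_t = 1.0 / (1.0 + g)
    b_s = g / (1.0 + g)
    return a_t * y_target + b_s * (y_source + bias)

# Example: with g = 1 the two predictions are simply averaged.
print(blend(np.array([0.0, 2.0]), np.array([1.0, 1.0]), g=np.array([1.0, 1.0])))
# -> [0.5 1.5]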
def viz_features(x, y, domain_ids, feature_names=None, alpha=.1, learner=None):
    #y = array_functions.normalize(y)
    x = array_functions.vec_to_2d(x)
    for i in range(x.shape[1]):
        xi = x[:, i]
        xi_train = xi
        yi = y
        ids_i = domain_ids
        title = str(i)
        density = None
        if feature_names is not None:
            title = str(i) + ': ' + feature_names[i]
        if learner is not None:
            # Replace the raw feature with per-domain predictions on a grid;
            # point sizes encode the (optional) estimated density.
            xi, yi, ids_i, density = train_on_data(xi, yi, domain_ids, learner)
            density = density * 100 + 1
            I = array_functions.is_invalid(density)
            density[I] = 200
            alpha = 1
        array_functions.plot_2d_sub(xi, yi, alpha=alpha, title=title,
                                    data_set_ids=ids_i, sizes=density)
        array_functions.plot_histogram(xi_train, 100)
def train(self, data):
    x = data.x
    x = array_functions.vec_to_2d(x)
    # Continuous ('c') KDE in every dimension; bandwidths chosen by
    # least-squares cross-validation.
    self.model = sm.nonparametric.KDEMultivariate(
        x, var_type='c' * x.shape[1], bw='cv_ls')
def train(self, data):
    I = data.is_train & data.is_labeled
    x = data.x[I, :]
    n = x.shape[0]
    p = x.shape[1]
    y = data.y[I]
    x_labeled_transform = self.transform.fit_transform(x)
    x_all_transform = self.transform.transform(data.x)
    # Append a bias column of ones so the intercept is solved for jointly.
    x_bias = np.hstack((x_labeled_transform, np.ones((n, 1))))
    x_all_bias = np.hstack(
        (x_all_transform, np.ones((x_all_transform.shape[0], 1))))
    # Ridge penalty matrix; the zero in the last diagonal entry leaves the
    # bias term unregularized.
    O = np.eye(p + 1)
    O[p, p] = 0
    x_L = x_all_bias
    if x_L.shape[0] > self.max_n_L:
        # Subsample the points used to build the Laplacian, to keep the
        # n x n graph tractable.
        I_L = np.random.choice(x_L.shape[0], self.max_n_L, replace=False)
        x_L = x_L[I_L, :]
    L = self.create_laplacian(x_L)
    # Closed-form Laplacian-regularized least squares:
    # (X'X + C*O + C2 * X_L' L X_L) w = X'y
    XX = x_bias.T.dot(x_bias)
    XLX = x_L.T.dot(L).dot(x_L)
    A = XX + self.C * O + self.C2 * XLX
    v = np.linalg.lstsq(A, x_bias.T.dot(y), rcond=None)
    w_anal = array_functions.vec_to_2d(v[0][0:p])
    b_anal = v[0][p]
    self.w = w_anal
    self.b = b_anal
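# A numpy-only sketch of the closed-form solve in train above. The repo's
# create_laplacian lives elsewhere; the rbf_laplacian helper below is one
# common construction (Gaussian similarity graph), not necessarily the one
# used here. C and C2 play the same roles as self.C and self.C2.
import numpy as np

def rbf_laplacian(x, sigma=1.0):
    # Unnormalized graph Laplacian L = D - W from a Gaussian similarity matrix.
    sq_dists = ((x[:, None, :] - x[None, :, :]) ** 2).sum(axis=2)
    W = np.exp(-sq_dists / (2 * sigma ** 2))
    np.fill_diagonal(W, 0)
    return np.diag(W.sum(axis=1)) - W

def solve_laplacian_ridge(x_labeled, y, x_all, C=1.0, C2=1.0):
    n, p = x_labeled.shape
    X = np.hstack((x_labeled, np.ones((n, 1))))
    X_all = np.hstack((x_all, np.ones((x_all.shape[0], 1))))
    O = np.eye(p + 1)
    O[p, p] = 0  # do not regularize the bias term
    L = rbf_laplacian(x_all)
    A = X.T.dot(X) + C * O + C2 * X_all.T.dot(L).dot(X_all)
    v = np.linalg.lstsq(A, X.T.dot(y), rcond=None)[0]
    return v[:p], v[p]  # weights, bias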
def viz(pc, fig=None, show_histogram=False, show=True):
    import create_data_set
    from methods import method
    source_learner = method.NadarayaWatsonMethod()
    target_learner = method.NadarayaWatsonMethod()
    #pc = configs_lib.ProjectConfigs()
    data = helper_functions.load_object('../' + pc.data_file).data
    data.set_train()
    source_data = data.get_transfer_subset(pc.source_labels)
    source_data.set_target()
    target_data = data.get_transfer_subset(pc.target_labels)
    target_data.set_target()
    source_learner.train_and_test(source_data)
    target_learner.train_and_test(target_data)
    source_learner.sigma = 10
    target_learner.sigma = 10
    x = array_functions.vec_to_2d(np.linspace(data.x.min(), data.x.max(), 100))
    test_data = data_lib.Data()
    test_data.x = x
    test_data.is_regression = True
    y_s = source_learner.predict(test_data).fu
    y_t = target_learner.predict(test_data).fu
    #array_functions.plot_line(x,y_t-y_s,pc.data_set,y_axes=np.asarray([-5,5]))
    # Plot the difference between target and source predictions on a uniform
    # grid over the data range.
    y = y_t - y_s
    #y = y - y.mean()
    array_functions.plot_line(x, y, title=None, fig=fig, show=show)
    if show_histogram:
        array_functions.plot_histogram(data.x, 20)
def make_uniform_data():
    X = np.linspace(0, 1, 100)
    # Step function: Y is initialized to 0, so only points at or above 0.5
    # need setting.
    Y = np.zeros(X.size)
    Y[X >= .5] = 1
    X = array_functions.vec_to_2d(X)
    return X, Y
def plot_g(self):
    x = np.linspace(0, 1)
    x = array_functions.vec_to_2d(x)
    g_orig = self.g_learner.predict_g(x)
    # Plot the target weight a_t = 1 / (1 + g) rather than g itself.
    g = 1 / (1 + g_orig)
    array_functions.plot_2d(x, g)
def plot_target(self):
    x = np.linspace(0, 1)
    x = array_functions.vec_to_2d(x)
    d = data_lib.Data()
    d.x = x
    # All-NaN labels mark every instance as unlabeled test data.
    d.y = np.nan * np.ones(x.shape[0])
    d.is_regression = True
    o = self.target_learner.predict(d)
    array_functions.plot_2d(x, o.y)
def train_on_data(x, y, domain_ids, learner):
    domain_ids = np.squeeze(domain_ids)
    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.y = y
    data.set_train()
    data.set_true_y()
    data.set_target()
    x_plot = np.zeros((0, 1))
    y_plot = np.zeros(0)
    ids_plot = np.zeros(0)
    density_plot = np.zeros(0)
    # scipy.linspace was an alias of np.linspace and has since been removed.
    x_test = np.linspace(x.min(), x.max(), 100)
    x_test = array_functions.vec_to_2d(x_test)
    data_test = data_class.Data()
    data_test.is_regression = True
    data_test.x = x_test
    data_test.y = np.zeros(x_test.shape[0])
    data_test.y[:] = np.nan
    from methods import density
    kde = density.KDE()
    max_n = 200.0
    # Fit the learner separately on each domain and evaluate it on the grid.
    for i in np.unique(domain_ids):
        I = domain_ids == i
        data_i = data.get_subset(I)
        if data_i.n > max_n:
            data_i = data_i.rand_sample(max_n / data_i.n)
        learner.train_and_test(data_i)
        o = learner.predict(data_test)
        x_plot = np.vstack((x_plot, x_test))
        y_plot = np.hstack((y_plot, o.y))
        ids_plot = np.hstack((ids_plot, np.ones(100) * i))
        '''
        kde.train_and_test(data_i)
        dens = kde.predict(data_test)
        dens.y = dens.y / dens.y.max()
        dens_y = dens.y
        '''
        dens_y = np.ones(data_test.n)
        density_plot = np.hstack((density_plot, dens_y))
    return x_plot, y_plot, ids_plot, density_plot
def create_and_save_data(x, y, domain_ids, file):
    data = data_class.Data()
    data.x = array_functions.vec_to_2d(x)
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    data.data_set_ids = domain_ids
    helper_functions.save_object(file, data)
def set_synthetic_classification(self):
    self.loss_function = loss_function.ZeroOneError()
    self.data_dir = 'data_sets/synthetic_classification'
    self.data_name = 'synthetic_classification'
    self.data_set_file_name = 'split_data.pkl'
    self.results_dir = 'synthetic_classification'
    self.target_labels = np.asarray([1, 2])
    #self.target_labels = array_functions.vec_to_2d(self.target_labels).T
    self.source_labels = np.asarray([3, 4])
    self.source_labels = array_functions.vec_to_2d(self.source_labels).T
    self.cv_loss_function = loss_function.LogLoss()
def predict(self, data):
    o = self.target_learner.predict(data)
    is_target = data.is_target
    o_source = self.source_learner.predict(data.get_subset(is_target))
    if not data.is_regression:
        assert o.fu.ndim == 2
    else:
        assert np.squeeze(o.fu).ndim == 1
        assert np.squeeze(o_source.fu).ndim == 1
        o.fu = o.fu.reshape((o.fu.size, 1))
        o_source.fu = o_source.fu.reshape((o_source.fu.size, 1))
    for i in range(o.fu.shape[1]):
        fu_t = o.fu[is_target, i]
        fu_s = o_source.fu[:, i]
        if self.g_learner is not None:
            # Blend target and source predictions with the learned,
            # per-instance mixture g; for 1D data also record g on a grid
            # for plotting.
            pred = self.g_learner.combine_predictions(data.x[is_target, :], fu_s, fu_t)
            if data.x.shape[1] == 1:
                x = np.linspace(data.x.min(), data.x.max(), 100)
                x = array_functions.vec_to_2d(x)
                g = self.g_learner.predict_g(x)
                o.x = x
                o.g = g
        else:
            # Fall back to a single scalar mixture weight self.g.
            pred = np.multiply(fu_t, 1 - self.g) + np.multiply(fu_s, self.g)
        o.fu[is_target, i] = pred
    #o.fu[is_target] = np.multiply(o.fu[is_target],(1-self.g)) + np.multiply(self.g,o_source.fu)
    if data.is_regression:
        o.y = o.fu
    else:
        fu = array_functions.replace_invalid(o.fu, 0, 1)
        fu = array_functions.normalize_rows(fu)
        o.fu = fu
        o.y = fu.argmax(1)
    if data.x.shape[1] == 1:
        x = array_functions.vec_to_2d(np.linspace(data.x.min(), data.x.max(), 100))
        o.linspace_x = x
        o.linspace_g = self.g_learner.predict_g(x)
    assert not (np.isnan(o.y)).any()
    assert not (np.isnan(o.fu)).any()
    return o
def create_concrete(transfer=False):
    file = 'concrete/Concrete_Data.csv'
    used_field_names, concrete_data = load_csv(file)
    data = data_class.Data()
    t = ''
    if transfer:
        feat_ind = 0
        domain_ind = (used_field_names == 'age').nonzero()[0][0]
        ages = concrete_data[:, domain_ind]
        # Bin samples by age; note ages in (28, 75] keep id 0.
        domain_ids = np.zeros(ages.shape)
        domain_ids[ages < 10] = 1
        domain_ids[(ages >= 10) & (ages <= 28)] = 2
        domain_ids[ages > 75] = 3
        data.x = concrete_data[:, 0:(concrete_data.shape[1] - 2)]  #0,3,5
        #data.x = preprocessing.scale(data.x)
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feat_ind])
            t = '-feat=' + str(feat_ind)
        elif concrete_num_feats >= data.x.shape[1]:
            t = '-' + str(min(data.x.shape[1], concrete_num_feats))
        else:
            assert False
        data.data_set_ids = domain_ids
    else:
        data.x = concrete_data[:, 0:-1]
    data.y = concrete_data[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    viz = False
    if viz:
        to_use = domain_ids > 0
        domain_ids = domain_ids[to_use]
        concrete_data = concrete_data[to_use, :]
        # np.delete is not in-place; the result must be assigned.
        concrete_data = np.delete(concrete_data, domain_ind, 1)
        viz_features(concrete_data, concrete_data[:, -1], domain_ids, used_field_names)
        return
    data.x = array_functions.standardize(data.x)
    #viz_features(data.x,data.y,data.data_set_ids)
    s = concrete_file % t
    helper_functions.save_object(s, data)
def create_concrete(transfer=False): file = "concrete/Concrete_Data.csv" used_field_names, concrete_data = load_csv(file) data = data_class.Data() t = "" if transfer: feat_ind = 0 domain_ind = (used_field_names == "age").nonzero()[0][0] ages = concrete_data[:, domain_ind] domain_ids = np.zeros(ages.shape) domain_ids[ages < 10] = 1 domain_ids[(ages >= 10) & (ages <= 28)] = 2 domain_ids[ages > 75] = 3 data.x = concrete_data[:, 0 : (concrete_data.shape[1] - 2)] # 0,3,5 # data.x = preprocessing.scale(data.x) if concrete_num_feats == 1: data.x = array_functions.vec_to_2d(data.x[:, feat_ind]) t = "-feat=" + str(feat_ind) elif concrete_num_feats >= data.x.shape[1]: t = "-" + str(min(data.x.shape[1], concrete_num_feats)) else: assert False data.data_set_ids = domain_ids else: data.x = concrete_data[:, 0:-1] data.y = concrete_data[:, -1] data.set_train() data.set_target() data.set_true_y() data.is_regression = True viz = False if viz: to_use = domain_ids > 0 domain_ids = domain_ids[to_use] concrete_data = concrete_data[to_use, :] np.delete(concrete_data, domain_ind, 1) viz_features(concrete_data, concrete_data[:, -1], domain_ids, used_field_names) return data.x = array_functions.standardize(data.x) # viz_features(data.x,data.y,data.data_set_ids) s = concrete_file % t helper_functions.save_object(s, data)
def create_bike_sharing(): file = "bike_sharing/day.csv" columns = [0] + range(2, 16) all_field_names = pd.read_csv(file, nrows=1, dtype="string") all_field_names = np.asarray(all_field_names.keys()) used_field_names = all_field_names[columns] bike_data = np.loadtxt(file, skiprows=1, delimiter=",", usecols=columns) domain_ind = used_field_names == "yr" domain_ids = np.squeeze(bike_data[:, domain_ind]) # inds_to_keep = (used_field_names == 'temp') | (used_field_names == 'atemp') # bike_data = bike_data[:,inds_to_keep] # used_field_names = used_field_names[inds_to_keep] viz = True to_use = np.asarray([8, 9, 10, 11]) x = bike_data[:, to_use] used_field_names = used_field_names[to_use] y = bike_data[:, -1] if viz: # learner = make_learner() learner = None viz_features(x, y, domain_ids, used_field_names, learner=learner) field_to_use = 1 x = x[:, field_to_use] data = data_class.Data() data.is_regression = True data.x = array_functions.vec_to_2d(x) data.x = array_functions.standardize(data.x) data.y = y data.y = array_functions.normalize(data.y) data.set_defaults() data.data_set_ids = domain_ids s = bike_file % ("-feat=" + str(field_to_use)) helper_functions.save_object(s, data) pass
def create_boston_housing(file_dir=""): boston_data = datasets.load_boston() data = data_class.Data() data.x = boston_data.data data.y = boston_data.target data.feature_names = list(boston_data.feature_names) data.set_train() data.set_target() data.set_true_y() data.is_regression = True s = boston_housing_raw_data_file x = data.x y = data.y if create_transfer_data: x_ind = 5 domain_ind = 12 domain_ids = np.ones(x.shape[0]) domain_ids = array_functions.bin_data(x[:, domain_ind], num_bins=4) x = np.delete(x, domain_ind, 1) # viz_features(x,y,domain_ids,boston_data.feature_names) data.data_set_ids = domain_ids if boston_num_feats == 1: data.x = data.x[:, x_ind] data.x = array_functions.vec_to_2d(data.x) s = s % "" elif boston_num_feats >= data.x.shape[1]: data.x = array_functions.standardize(data.x) p = min(boston_num_feats, data.x.shape[1]) s = s % ("-" + str(p)) else: assert False else: s %= "" if file_dir != "": s = file_dir + "/" + s helper_functions.save_object(s, data)
def create_boston_housing(file_dir=''):
    # Note: sklearn.datasets.load_boston was removed in scikit-learn 1.2.
    boston_data = datasets.load_boston()
    data = data_class.Data()
    data.x = boston_data.data
    data.y = boston_data.target
    data.feature_names = list(boston_data.feature_names)
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    s = boston_housing_raw_data_file
    x = data.x
    y = data.y
    create_transfer_data = False
    create_y_split = True
    if create_y_split:
        from base import transfer_project_configs as configs_lib
        pc = configs_lib.ProjectConfigs()
        main_configs = configs_lib.MainConfigs(pc)
        learner = main_configs.learner
        learner.quiet = True
        learner.target_learner[0].quiet = True
        learner.source_learner.quiet = True
        learner.g_learner.quiet = False
        # Split the data into two pseudo-domains by binning on y.
        domain_ids = array_functions.bin_data(data.y, num_bins=2)
        data.data_set_ids = domain_ids
        data.is_train[:] = True
        corrs = []
        for i in range(x.shape[1]):
            corrs.append(scipy.stats.pearsonr(x[:, i], y)[0])
        learner.train_and_test(data)
        print('Just playing with data - not meant to save it')
        for i, name in enumerate(data.feature_names):
            v = learner.g_learner.g[i]
            if abs(v) < 1e-6:
                v = 0
            print(name + ': ' + str(v))
        exit()
    elif create_transfer_data:
        x_ind = 5
        domain_ind = 12
        domain_ids = np.ones(x.shape[0])
        domain_ids = array_functions.bin_data(x[:, domain_ind], num_bins=4)
        x = np.delete(x, domain_ind, 1)
        #viz_features(x,y,domain_ids,boston_data.feature_names)
        data.data_set_ids = domain_ids
        if boston_num_feats == 1:
            data.x = data.x[:, x_ind]
            data.x = array_functions.vec_to_2d(data.x)
            s = s % ''
        elif boston_num_feats >= data.x.shape[1]:
            data.x = array_functions.standardize(data.x)
            p = min(boston_num_feats, data.x.shape[1])
            s = s % ('-' + str(p))
        else:
            assert False
    else:
        s %= ''
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
def predict(self, data):
    # The original called vec_to_2d twice; once is enough.
    x = array_functions.vec_to_2d(data.x)
    y = self.model.pdf(x)
    o = results.Output(data, y)
    return o
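# Standalone sketch of the statsmodels KDE used by the train/predict pair
# above, for context. The data here is synthetic, and sm is assumed to be
# statsmodels.api as in the rest of the repo.
import numpy as np
import statsmodels.api as sm

x = np.random.randn(200, 2)
# Two continuous dimensions ('cc'), bandwidth via least-squares CV.
kde = sm.nonparametric.KDEMultivariate(x, var_type='cc', bw='cv_ls')
print(kde.pdf(np.zeros((1, 2))))  # density estimate at the origin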