def train(self, data):
    """Learn non-negative source instance weights and train the target learner.

    One weight per source instance is chosen by minimising
    ``sum((S * diag(w) * y_s - y_true) ** 2) + self.C * ||w||^2`` subject to
    ``w >= 0``.  ``S`` is the result of
    ``array_functions.make_smoothing_matrix`` applied to the RBF matrix
    between the labeled target rows and the source rows; both feature sets
    have their labels appended as an extra column and are standardized
    separately.

    The weights are written into ``instance_weights`` of the transfer subset
    of ``data`` and ``self.target_learner.train_and_test`` is run on it.
    If the solver raises, or finishes without producing a value, every
    source weight falls back to 0 and a diagnostic is printed.

    Sets ``self.instance_weights``, ``self.x`` and ``self.w``.
    """
    assert data.is_regression
    # Both results are overwritten just below; the call is kept unchanged in
    # case get_predictions has side effects.
    y_s, y_true = self.get_predictions(data)
    target_labeled = data.is_target & data.is_labeled
    #y_s = y_s[target_labeled]
    y_s = data.y[data.is_source]
    y_true = data.true_y[target_labeled]

    x_s = data.x[data.is_source]
    x_s = array_functions.append_column(x_s, data.y[data.is_source])
    x_s = array_functions.standardize(x_s)
    x_t = data.x[target_labeled]
    x_t = array_functions.append_column(x_t, data.y[target_labeled])
    x_t = array_functions.standardize(x_t)

    Wrbf = array_functions.make_rbf(x_t, self.sigma, self.metric, x2=x_s)
    S = array_functions.make_smoothing_matrix(Wrbf)

    n_source = x_s.shape[0]
    w = cvx.Variable(n_source)
    constraints = [w >= 0]
    reg = cvx.norm(w)**2
    loss = cvx.sum_entries(
        cvx.power(S*cvx.diag(w)*y_s - y_true, 2)
    )
    obj = cvx.Minimize(loss + self.C*reg)
    prob = cvx.Problem(obj, constraints)
    assert prob.is_dcp()

    try:
        prob.solve()
        w_value = w.value
    except Exception:
        w_value = None
    if w_value is None:
        # Solver failure: fall back to a constant weight for every source row.
        k = 0
        print('CVX problem: setting g = ' + str(k))
        print('\tsigma=' + str(self.sigma))
        print('\tC=' + str(self.C))
        w_value = k*np.ones(n_source)
    # Flatten so a column-shaped solver result fits the 1-D masked slot.
    w_value = np.asarray(w_value, dtype=float).ravel()

    all_data = data.get_transfer_subset(
        self.configs.labels_to_keep, include_unlabeled=True
    )
    all_data.instance_weights = np.ones(all_data.n)
    # Use the (possibly fallback) w_value; w.value is unset after a failure.
    all_data.instance_weights[all_data.is_source] = w_value
    self.instance_weights = all_data.instance_weights
    self.target_learner.train_and_test(all_data)

    self.x = all_data.x[all_data.is_source]
    self.w = all_data.instance_weights[all_data.is_source]
def create_wine(data_to_create=WINE_RED, viz=True):
    """Build one of the wine-quality regression data sets and save it.

    Args:
        data_to_create: ``WINE_RED``, ``WINE_WHITE`` or ``WINE_TRANSFER``.
            For ``WINE_TRANSFER`` the red and white rows are stacked and
            ``data_set_ids`` is 0 for red rows and 1 for white rows;
            otherwise ``data_set_ids`` is ``None``.
        viz: Only used for ``WINE_TRANSFER``.  When true (the previously
            hard-coded behaviour) a learner is created with
            ``make_learner()`` and ``viz_features`` is called before saving.

    Raises:
        AssertionError: If ``data_to_create`` is not one of the three
            supported constants.
    """
    red_file = "wine/winequality-red.csv"
    white_file = "wine/winequality-white.csv"
    field_names, red_data = load_csv(red_file, delim=";")
    white_data = load_csv(white_file, delim=";")[1]

    if data_to_create == WINE_TRANSFER:
        # Append a trailing id column so the source of each row survives
        # the stacking.
        red_ids = np.zeros((red_data.shape[0], 1))
        white_ids = np.ones((white_data.shape[0], 1))
        red_data = np.hstack((red_data, red_ids))
        white_data = np.hstack((white_data, white_ids))
        wine_data = np.vstack((red_data, white_data))
        ids = wine_data[:, -1]
        x = wine_data[:, :-2]
        y = wine_data[:, -2]
        if viz:
            learner = make_learner()
            viz_features(
                x, y, ids, field_names[:-1], alpha=0.01, learner=learner
            )
        suffix = "transfer"
    else:
        if data_to_create == WINE_RED:
            wine_data = red_data
            suffix = "red"
        elif data_to_create == WINE_WHITE:
            wine_data = white_data
            suffix = "white"
        else:
            # Explicit raise so the check is not stripped under ``-O``.
            raise AssertionError(
                "unknown data_to_create: %r" % (data_to_create,)
            )
        ids = None
        x = wine_data[:, :-1]
        y = wine_data[:, -1]

    data = data_class.Data()
    data.x = array_functions.standardize(x)
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.data_set_ids = ids
    data.is_regression = True
    # Disabled subsampling variant, kept for reference:
    #   data = data.rand_sample(.25, data.data_set_ids == 0)
    #   data = data.rand_sample(.1, data.data_set_ids == 1)
    #   s = wine_file % ('-small-' + str(data.p))
    s = wine_file % ("-" + suffix)
    helper_functions.save_object(s, data)
def create_concrete(transfer=False):
    """Build the concrete-strength regression data set and save it.

    Args:
        transfer: When true, rows are assigned a domain id from the ``age``
            column (1: age < 10, 2: 10 <= age <= 28, 3: age > 75, 0
            otherwise), the last two columns are dropped from the features,
            and the module-level ``concrete_num_feats`` selects either the
            single feature ``feat_ind`` or all features.  When false, every
            column except the last is used as a feature.

    Raises:
        AssertionError: If ``transfer`` is true and ``concrete_num_feats`` is
            neither 1 nor at least the number of available features.
    """
    file_name = 'concrete/Concrete_Data.csv'
    used_field_names, concrete_data = load_csv(file_name)
    data = data_class.Data()
    t = ''
    if transfer:
        feat_ind = 0
        domain_ind = (used_field_names == 'age').nonzero()[0][0]
        ages = concrete_data[:, domain_ind]
        domain_ids = np.zeros(ages.shape)
        domain_ids[ages < 10] = 1
        domain_ids[(ages >= 10) & (ages <= 28)] = 2
        domain_ids[ages > 75] = 3
        # Positional slice: presumably 'age' and the target are the last two
        # columns -- TODO confirm against the CSV layout.
        data.x = concrete_data[:, 0:(concrete_data.shape[1] - 2)]
        #0,3,5
        #data.x = preprocessing.scale(data.x)
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feat_ind])
            t = '-feat=' + str(feat_ind)
        elif concrete_num_feats >= data.x.shape[1]:
            t = '-' + str(min(data.x.shape[1], concrete_num_feats))
        else:
            # Explicit raise so the check is not stripped under ``-O``.
            raise AssertionError(
                'unsupported concrete_num_feats: %r' % (concrete_num_feats,)
            )
        data.data_set_ids = domain_ids
    else:
        data.x = concrete_data[:, 0:-1]
    data.y = concrete_data[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    viz = False
    if viz:
        # NOTE: this path needs domain_ids/domain_ind, which only exist when
        # transfer is true.
        to_use = domain_ids > 0
        domain_ids = domain_ids[to_use]
        concrete_data = concrete_data[to_use, :]
        # np.delete returns a new array; the result used to be discarded, so
        # the domain column was never removed.  Drop its name as well so the
        # names stay aligned with the columns.
        concrete_data = np.delete(concrete_data, domain_ind, 1)
        used_field_names = np.delete(used_field_names, domain_ind)
        viz_features(
            concrete_data, concrete_data[:, -1], domain_ids, used_field_names
        )
        return

    data.x = array_functions.standardize(data.x)
    #viz_features(data.x,data.y,data.data_set_ids)
    s = concrete_file % t
    helper_functions.save_object(s, data)
def create_concrete(transfer=False):
    """Create and save the concrete-strength regression data set.

    The last CSV column is the regression target.  With ``transfer`` set,
    each row gets a domain id derived from the ``age`` column (1 for
    age < 10, 2 for 10 <= age <= 28, 3 for age > 75, otherwise 0), the last
    two columns are excluded from the features, and the module-level
    ``concrete_num_feats`` decides between a single feature and all of them.
    Without ``transfer`` all columns but the last are features.

    Raises:
        AssertionError: If ``transfer`` is true and ``concrete_num_feats`` is
            neither 1 nor at least the number of available features.
    """
    csv_path = "concrete/Concrete_Data.csv"
    used_field_names, concrete_data = load_csv(csv_path)
    data = data_class.Data()
    t = ""
    if transfer:
        feat_ind = 0
        domain_ind = (used_field_names == "age").nonzero()[0][0]
        ages = concrete_data[:, domain_ind]
        domain_ids = np.zeros(ages.shape)
        domain_ids[ages < 10] = 1
        domain_ids[(ages >= 10) & (ages <= 28)] = 2
        domain_ids[ages > 75] = 3
        # Positional slice; presumably "age" and the target occupy the last
        # two columns -- TODO confirm against the CSV layout.
        data.x = concrete_data[:, 0 : (concrete_data.shape[1] - 2)]
        # 0,3,5
        # data.x = preprocessing.scale(data.x)
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feat_ind])
            t = "-feat=" + str(feat_ind)
        elif concrete_num_feats >= data.x.shape[1]:
            t = "-" + str(min(data.x.shape[1], concrete_num_feats))
        else:
            # Raised explicitly so the check also runs under ``-O``.
            raise AssertionError(
                "unsupported concrete_num_feats: %r" % (concrete_num_feats,)
            )
        data.data_set_ids = domain_ids
    else:
        data.x = concrete_data[:, 0:-1]
    data.y = concrete_data[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    viz = False
    if viz:
        # NOTE: relies on domain_ids/domain_ind, defined only when transfer
        # is true.
        to_use = domain_ids > 0
        domain_ids = domain_ids[to_use]
        concrete_data = concrete_data[to_use, :]
        # np.delete does not modify its argument; keep the returned arrays
        # so the domain column (and its name) are actually removed.
        concrete_data = np.delete(concrete_data, domain_ind, 1)
        used_field_names = np.delete(used_field_names, domain_ind)
        viz_features(
            concrete_data, concrete_data[:, -1], domain_ids, used_field_names
        )
        return

    data.x = array_functions.standardize(data.x)
    # viz_features(data.x,data.y,data.data_set_ids)
    s = concrete_file % t
    helper_functions.save_object(s, data)
def create_bike_sharing(viz=True, field_to_use=1):
    """Build the single-feature bike-sharing regression data set and save it.

    Reads columns 0 and 2..15 of ``bike_sharing/day.csv`` (column 1 is
    skipped), uses the ``yr`` column as the data-set id and the last loaded
    column as the target.  One of the four candidate feature columns
    (positions 8..11 of the loaded columns) is kept, standardized, and the
    target is passed through ``array_functions.normalize``.

    Args:
        viz: When true (the previous hard-coded behaviour), call
            ``viz_features`` on the four candidate features before saving.
        field_to_use: Index (0..3) into the four candidate features of the
            one to keep; it is also embedded in the saved file name.
    """
    file_name = "bike_sharing/day.csv"
    # list(...) keeps the concatenation valid on Python 3, where range() is
    # not a list.
    columns = [0] + list(range(2, 16))
    all_field_names = pd.read_csv(file_name, nrows=1, dtype="string")
    all_field_names = np.asarray(all_field_names.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(
        file_name, skiprows=1, delimiter=",", usecols=columns
    )
    domain_ind = used_field_names == "yr"
    domain_ids = np.squeeze(bike_data[:, domain_ind])
    # inds_to_keep = (used_field_names == 'temp') | (used_field_names == 'atemp')
    # bike_data = bike_data[:,inds_to_keep]
    # used_field_names = used_field_names[inds_to_keep]

    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        # learner = make_learner()
        learner = None
        viz_features(x, y, domain_ids, used_field_names, learner=learner)

    x = x[:, field_to_use]
    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids

    s = bike_file % ("-feat=" + str(field_to_use))
    helper_functions.save_object(s, data)
def create_boston_housing(file_dir=""):
    """Load the Boston housing data, optionally add domain ids, and save it.

    The module-level flag ``create_transfer_data`` decides whether rows are
    binned into four domains on feature column 12; ``boston_num_feats``
    then selects either the single feature at column 5 or all features
    (standardized).  The result is saved under a name derived from
    ``boston_housing_raw_data_file``, prefixed with ``file_dir`` when that
    is not the empty string.
    """
    boston = datasets.load_boston()
    data = data_class.Data()
    data.x = boston.data
    data.y = boston.target
    data.feature_names = list(boston.feature_names)
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    save_name = boston_housing_raw_data_file
    x = data.x
    y = data.y
    if not create_transfer_data:
        save_name = save_name % ""
    else:
        x_ind = 5
        domain_ind = 12
        # Placeholder that is immediately replaced by the binned ids.
        domain_ids = np.ones(x.shape[0])
        domain_ids = array_functions.bin_data(x[:, domain_ind], num_bins=4)
        # NOTE: only the local copy loses the domain column; data.x is left
        # untouched by this line.
        x = np.delete(x, domain_ind, 1)
        # viz_features(x,y,domain_ids,boston_data.feature_names)
        data.data_set_ids = domain_ids
        if boston_num_feats == 1:
            single_feature = data.x[:, x_ind]
            data.x = single_feature
            data.x = array_functions.vec_to_2d(data.x)
            save_name = save_name % ""
        elif boston_num_feats >= data.x.shape[1]:
            data.x = array_functions.standardize(data.x)
            num_feats = min(boston_num_feats, data.x.shape[1])
            save_name = save_name % ("-" + str(num_feats))
        else:
            assert False

    if file_dir != "":
        save_name = file_dir + "/" + save_name
    helper_functions.save_object(save_name, data)
def create_bike_sharing(viz=True, field_to_use=1):
    """Create and save the one-feature bike-sharing regression data set.

    Columns 0 and 2..15 of ``bike_sharing/day.csv`` are loaded (column 1 is
    skipped).  The ``yr`` column becomes ``data_set_ids`` and the last
    loaded column is the target.  Of the four candidate features
    (positions 8..11 of the loaded columns) one is kept and standardized;
    the target goes through ``array_functions.normalize``.

    Args:
        viz: When true (the previous hard-coded behaviour), ``viz_features``
            is called on the four candidate features before saving.
        field_to_use: Which of the four candidate features (0..3) to keep;
            the value is also part of the saved file name.
    """
    csv_path = 'bike_sharing/day.csv'
    # range() is not a list on Python 3, so convert before concatenating.
    columns = [0] + list(range(2, 16))
    header = pd.read_csv(csv_path, nrows=1, dtype='string')
    all_field_names = np.asarray(header.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(
        csv_path, skiprows=1, delimiter=',', usecols=columns
    )
    domain_ind = used_field_names == 'yr'
    domain_ids = np.squeeze(bike_data[:, domain_ind])
    #inds_to_keep = (used_field_names == 'temp') | (used_field_names == 'atemp')
    #bike_data = bike_data[:,inds_to_keep]
    #used_field_names = used_field_names[inds_to_keep]

    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        #learner = make_learner()
        learner = None
        viz_features(x, y, domain_ids, used_field_names, learner=learner)

    x = x[:, field_to_use]
    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids

    s = bike_file % ('-feat=' + str(field_to_use))
    helper_functions.save_object(s, data)
def create_boston_housing(file_dir=''):
    """Load the Boston housing data and either experiment on it or save it.

    Two local switches pick the mode.  With ``create_y_split`` on (the
    current setting) the rows are binned into two domains on the target, a
    learner taken from the project configs is trained on them, the entries
    of ``learner.g_learner.g`` are printed per feature name, and the process
    exits without saving.  With ``create_transfer_data`` on instead, rows
    are binned into four domains on feature column 12 and the module-level
    ``boston_num_feats`` picks the feature set.  Otherwise the raw data is
    saved.  ``file_dir`` is prefixed to the file name unless it is the
    empty string.
    """
    boston = datasets.load_boston()
    data = data_class.Data()
    data.x = boston.data
    data.y = boston.target
    data.feature_names = list(boston.feature_names)
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    save_name = boston_housing_raw_data_file
    x = data.x
    y = data.y
    create_transfer_data = False
    create_y_split = True

    if create_y_split:
        from base import transfer_project_configs as configs_lib
        project_configs = configs_lib.ProjectConfigs()
        main_configs = configs_lib.MainConfigs(project_configs)
        learner = main_configs.learner
        learner.quiet = True
        learner.target_learner[0].quiet = True
        learner.source_learner.quiet = True
        learner.g_learner.quiet = False

        domain_ids = array_functions.bin_data(data.y, num_bins=2)
        data.data_set_ids = domain_ids
        data.is_train[:] = True
        # Per-feature Pearson correlation with the target; computed but not
        # used afterwards.
        corrs = [
            scipy.stats.pearsonr(x[:, col], y)[0]
            for col in range(x.shape[1])
        ]
        learner.train_and_test(data)
        print('Just playing with data - not meant to save it')
        for idx, feature_name in enumerate(data.feature_names):
            coef = learner.g_learner.g[idx]
            # Show values that are numerically zero as exactly 0.
            if abs(coef) < 1e-6:
                coef = 0
            print(feature_name + ': ' + str(coef))
        exit()
    elif create_transfer_data:
        x_ind = 5
        domain_ind = 12
        # Placeholder that is immediately replaced by the binned ids.
        domain_ids = np.ones(x.shape[0])
        domain_ids = array_functions.bin_data(x[:, domain_ind], num_bins=4)
        # NOTE: only the local copy loses the domain column; data.x is left
        # untouched by this line.
        x = np.delete(x, domain_ind, 1)
        #viz_features(x,y,domain_ids,boston_data.feature_names)
        data.data_set_ids = domain_ids
        if boston_num_feats == 1:
            data.x = data.x[:, x_ind]
            data.x = array_functions.vec_to_2d(data.x)
            save_name = save_name % ''
        elif boston_num_feats >= data.x.shape[1]:
            data.x = array_functions.standardize(data.x)
            num_feats = min(boston_num_feats, data.x.shape[1])
            save_name = save_name % ('-' + str(num_feats))
        else:
            assert False
    else:
        save_name = save_name % ''

    if file_dir != '':
        save_name = file_dir + '/' + save_name
    helper_functions.save_object(save_name, data)
def create_wine(data_to_create=WINE_RED, viz=True):
    """Create and save one of the wine-quality regression data sets.

    Args:
        data_to_create: ``WINE_RED``, ``WINE_WHITE`` or ``WINE_TRANSFER``.
            ``WINE_TRANSFER`` stacks the red and white rows and records
            ``data_set_ids`` of 0 (red) and 1 (white); the single-colour
            variants store ``data_set_ids = None``.
        viz: Only consulted for ``WINE_TRANSFER``.  When true (the
            previously hard-coded behaviour), ``make_learner()`` is called
            and the features are passed to ``viz_features`` before saving.

    Raises:
        AssertionError: If ``data_to_create`` is none of the three supported
            constants.
    """
    red_file = 'wine/winequality-red.csv'
    white_file = 'wine/winequality-white.csv'
    field_names, red_data = load_csv(red_file, delim=';')
    white_data = load_csv(white_file, delim=';')[1]

    if data_to_create == WINE_TRANSFER:
        # A trailing id column records which file each row came from.
        red_ids = np.zeros((red_data.shape[0], 1))
        white_ids = np.ones((white_data.shape[0], 1))
        wine_data = np.vstack((
            np.hstack((red_data, red_ids)),
            np.hstack((white_data, white_ids)),
        ))
        ids = wine_data[:, -1]
        x = wine_data[:, :-2]
        y = wine_data[:, -2]
        if viz:
            learner = make_learner()
            #learner = None
            viz_features(
                x, y, ids, field_names[:-1], alpha=.01, learner=learner
            )
        suffix = 'transfer'
    else:
        if data_to_create == WINE_RED:
            wine_data, suffix = red_data, 'red'
        elif data_to_create == WINE_WHITE:
            wine_data, suffix = white_data, 'white'
        else:
            # Raised explicitly so the check also runs under ``-O``.
            raise AssertionError(
                'unknown data_to_create: %r' % (data_to_create,)
            )
        ids = None
        x = wine_data[:, :-1]
        y = wine_data[:, -1]

    data = data_class.Data()
    data.x = array_functions.standardize(x)
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.data_set_ids = ids
    data.is_regression = True
    # Disabled subsampling variant, kept for reference:
    #   data = data.rand_sample(.25, data.data_set_ids == 0)
    #   data = data.rand_sample(.1, data.data_set_ids == 1)
    #   s = wine_file % ('-small-' + str(data.p))
    s = wine_file % ('-' + suffix)
    helper_functions.save_object(s, data)
def train(self, data):
    """Fit the ``g`` and ``h`` vectors by minimising ``SMSTransfer.eval``.

    RBF matrices are built on the standardized features: labeled target
    training rows against themselves, and all target rows against the
    labeled ones.  Both are multiplied by the inverse of the ``self.C2``
    regularised labeled matrix to give ``R_ll`` and ``R_ul``.  The
    objective is then minimised with L-BFGS-B from a zero start, using
    ``SMSTransfer.gradient`` as the Jacobian.

    Sets ``self.g``, ``self.h`` and ``self.opt_succeeded``, and stores
    ``R_ul`` on ``data``.
    """
    assert data.is_regression
    # Both results are replaced just below; the call itself is kept.
    y_s, y_true = self.get_predictions(data)
    target_mask = data.is_target
    labeled_mask = data.is_target & data.is_labeled & data.is_train
    y_s = data.y[labeled_mask]
    y_true = data.true_y[labeled_mask]

    x_std = array_functions.standardize(data.x)
    x_target = x_std[target_mask]
    x_labeled = x_std[labeled_mask]

    C = self.C
    C2 = self.C2
    W_ll = array_functions.make_rbf(x_labeled, self.sigma, self.metric)
    ridge = C2*np.eye(W_ll.shape[0])
    W_ll_reg_inv = np.linalg.inv(W_ll + ridge)
    W_ul = array_functions.make_rbf(
        x_target, self.sigma, self.metric, x2=x_labeled
    )
    R_ll = W_ll.dot(W_ll_reg_inv)
    R_ul = W_ul.dot(W_ll_reg_inv)
    assert not array_functions.has_invalid(R_ll)
    assert not array_functions.has_invalid(R_ul)

    def reg(gh):
        return SMSTransfer.reg(gh, R_ul)

    n_labeled = R_ll.shape[0]
    # g and h are packed into one column vector; gh_ids marks the second
    # half with 1.
    g0 = np.zeros((n_labeled * 2, 1))
    gh_ids = np.zeros(g0.shape)
    gh_ids[n_labeled:] = 1

    minimize_kwargs = dict(
        method='L-BFGS-B',
        options={'disp': False, 'maxiter': np.inf, 'maxfun': np.inf},
        constraints=[],
        args=(R_ll, R_ul, y_s, y_true, C, reg),
    )
    results = optimize.minimize(
        SMSTransfer.eval, g0, jac=SMSTransfer.gradient, **minimize_kwargs
    )

    check_results = False
    if check_results:
        # Debug aid: repeat the optimisation without the supplied Jacobian
        # and compare the two solutions.
        results2 = optimize.minimize(
            SMSTransfer.eval, g0, jac=None, **minimize_kwargs
        )
        print(self.params)
        scipy_opt_methods.compare_results(results, results2, gh_ids)
        diff = results.x-results2.x
        print(results.x)
        print(results2.x)

    g, h = SMSTransfer.unpack_gh(results.x, n_labeled)
    self.opt_succeeded = results.success
    if not self.opt_succeeded:
        print('SMS Opt failed')
    data.R_ul = R_ul
    self.g = g
    self.h = h
    #assert results.success