def train(self, data):
        """Learn per-source-instance weights and train the target learner with them.

        A convex problem is solved for one non-negative weight per source
        instance: the sum of squared differences between
        ``S * diag(w) * y_source`` and the true labels of the labeled target
        instances is minimized, plus ``self.C`` times the squared norm of
        ``w``. ``S`` is produced by ``array_functions.make_smoothing_matrix``
        from the RBF matrix between target and source points, where each point
        is its features with its label appended as an extra column (source and
        target are standardized separately).

        The learned weights are written into ``instance_weights`` of the
        transfer subset, which is then passed to
        ``self.target_learner.train_and_test``. ``self.instance_weights``,
        ``self.x`` and ``self.w`` are set as a side effect.

        If the solver raises, or finishes without a value for ``w``, every
        source weight falls back to 0 and a diagnostic is printed.

        :param data: regression data set exposing the masks and arrays read below
        """
        assert data.is_regression
        # The returned predictions are not used below; the call is kept because
        # it is not visible here whether get_predictions has side effects.
        self.get_predictions(data)
        I = data.is_target & data.is_labeled
        y_s = data.y[data.is_source]
        y_true = data.true_y[I]

        x_s = data.x[data.is_source]
        x_s = array_functions.append_column(x_s, data.y[data.is_source])
        x_s = array_functions.standardize(x_s)
        x_t = data.x[I]
        x_t = array_functions.append_column(x_t, data.y[I])
        x_t = array_functions.standardize(x_t)
        n_source = x_s.shape[0]

        Wrbf = array_functions.make_rbf(x_t, self.sigma, self.metric, x2=x_s)
        S = array_functions.make_smoothing_matrix(Wrbf)
        w = cvx.Variable(n_source)
        constraints = [w >= 0]
        reg = cvx.norm(w)**2
        loss = cvx.sum_entries(
            cvx.power(
                S*cvx.diag(w)*y_s - y_true, 2
            )
        )
        obj = cvx.Minimize(loss + self.C*reg)
        prob = cvx.Problem(obj, constraints)
        assert prob.is_dcp()
        try:
            prob.solve()
            w_value = w.value
        except Exception:
            # Solver errors are treated as "no solution"; KeyboardInterrupt and
            # SystemExit are deliberately no longer swallowed.
            w_value = None
        if w_value is None:
            # Either solve() raised or it finished without assigning w.
            k = 0
            print('CVX problem: setting g = ' + str(k))
            print('\tsigma=' + str(self.sigma))
            print('\tC=' + str(self.C))
            w_value = k*np.ones(n_source)
        # Flatten so a column-shaped solver result fits the 1-D masked slot.
        w_value = np.asarray(w_value).ravel()

        all_data = data.get_transfer_subset(self.configs.labels_to_keep, include_unlabeled=True)
        all_data.instance_weights = np.ones(all_data.n)
        # Use w_value (not w.value) so the fallback above actually takes effect.
        all_data.instance_weights[all_data.is_source] = w_value
        self.instance_weights = all_data.instance_weights
        self.target_learner.train_and_test(all_data)

        self.x = all_data.x[all_data.is_source]
        self.w = all_data.instance_weights[all_data.is_source]
예제 #2
0
def create_wine(data_to_create=WINE_RED):
    """Build one wine-quality regression data set and save it.

    Both ';'-delimited wine CSV files are always loaded. Depending on
    ``data_to_create``:

    * ``WINE_TRANSFER`` - red and white rows are stacked, with an id column
      (0 for red, 1 for white) used as ``data_set_ids``; the features are
      also passed to ``viz_features`` together with a fresh learner.
    * ``WINE_RED`` / ``WINE_WHITE`` - only that table is used and
      ``data_set_ids`` is ``None``.

    The last column of the loaded table is the target, the rest are features
    (standardized before saving). The object is saved under
    ``wine_file % ('-' + suffix)``.

    :param data_to_create: one of WINE_RED, WINE_WHITE, WINE_TRANSFER
    """
    names, red = load_csv("wine/winequality-red.csv", delim=";")
    white = load_csv("wine/winequality-white.csv", delim=";")[1]

    if data_to_create == WINE_TRANSFER:
        # Tag every row with its origin before stacking the two tables.
        red_tagged = np.hstack((red, np.zeros((red.shape[0], 1))))
        white_tagged = np.hstack((white, np.ones((white.shape[0], 1))))
        combined = np.vstack((red_tagged, white_tagged))

        ids = combined[:, -1]
        x = combined[:, :-2]
        y = combined[:, -2]
        viz_features(x, y, ids, names[:-1], alpha=0.01, learner=make_learner())
        suffix = "transfer"
    else:
        if data_to_create == WINE_RED:
            table, suffix = red, "red"
        elif data_to_create == WINE_WHITE:
            table, suffix = white, "white"
        else:
            assert False

        ids = None
        x = table[:, :-1]
        y = table[:, -1]

    data = data_class.Data()
    data.x = array_functions.standardize(x)
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.data_set_ids = ids
    data.is_regression = True
    # Disabled alternative kept for reference: subsample each domain
    # (rand_sample(.25, ids == 0), rand_sample(.1, ids == 1)) and save under
    # wine_file % ('-small-' + str(data.p)).
    helper_functions.save_object(wine_file % ("-" + suffix), data)
예제 #3
0
def create_concrete(transfer=False):
    """Build the concrete regression data set and save it.

    The last column of the loaded table is the target. Without ``transfer``
    every other column is a feature. With ``transfer`` the last two columns
    are excluded from the features, and ``data_set_ids`` is derived from the
    'age' column: 1 for age < 10, 2 for 10 <= age <= 28, 3 for age > 75 and
    0 otherwise. The module-level ``concrete_num_feats`` then selects either
    the single feature at index 0 or all features.

    Features are standardized and the object is saved under
    ``concrete_file % suffix``.

    :param transfer: build the multi-domain (transfer) variant when true
    """
    field_names, table = load_csv('concrete/Concrete_Data.csv')

    data = data_class.Data()
    name_suffix = ''
    if not transfer:
        data.x = table[:, 0:-1]
    else:
        first_feature = 0
        domain_ind = (field_names == 'age').nonzero()[0][0]
        ages = table[:, domain_ind]
        domain_ids = np.zeros(ages.shape)
        age_domains = (
            (1, ages < 10),
            (2, (ages >= 10) & (ages <= 28)),
            (3, ages > 75),
        )
        for domain, in_domain in age_domains:
            domain_ids[in_domain] = domain

        data.x = table[:, :-2]
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, first_feature])
            name_suffix = '-feat=' + str(first_feature)
        elif concrete_num_feats >= data.x.shape[1]:
            name_suffix = '-' + str(min(data.x.shape[1], concrete_num_feats))
        else:
            assert False
        data.data_set_ids = domain_ids

    data.y = table[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    # Debug switch: when enabled, only plots the features and saves nothing.
    viz = False
    if viz:
        # NOTE(review): domain_ids / domain_ind exist only when transfer is true.
        keep = domain_ids > 0
        domain_ids = domain_ids[keep]
        table = table[keep, :]
        # NOTE(review): np.delete returns a new array and the result is
        # discarded here, so the column is still present in `table`.
        np.delete(table, domain_ind, 1)
        viz_features(table, table[:, -1], domain_ids, field_names)
        return

    data.x = array_functions.standardize(data.x)
    helper_functions.save_object(concrete_file % name_suffix, data)
예제 #4
0
def create_concrete(transfer=False):
    """Build the concrete regression data set and save it.

    The last column of the loaded table is the target. Without ``transfer``
    every other column is a feature. With ``transfer`` the last two columns
    are excluded from the features, and ``data_set_ids`` is derived from the
    "age" column: 1 for age < 10, 2 for 10 <= age <= 28, 3 for age > 75 and
    0 otherwise. The module-level ``concrete_num_feats`` then selects either
    the single feature at index 0 or all features.

    Features are standardized and the object is saved under
    ``concrete_file % suffix``.

    :param transfer: build the multi-domain (transfer) variant when true
    """
    field_names, table = load_csv("concrete/Concrete_Data.csv")

    data = data_class.Data()
    name_suffix = ""
    if not transfer:
        data.x = table[:, 0:-1]
    else:
        first_feature = 0
        domain_ind = (field_names == "age").nonzero()[0][0]
        ages = table[:, domain_ind]
        domain_ids = np.zeros(ages.shape)
        age_domains = (
            (1, ages < 10),
            (2, (ages >= 10) & (ages <= 28)),
            (3, ages > 75),
        )
        for domain, in_domain in age_domains:
            domain_ids[in_domain] = domain

        data.x = table[:, :-2]
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, first_feature])
            name_suffix = "-feat=" + str(first_feature)
        elif concrete_num_feats >= data.x.shape[1]:
            name_suffix = "-" + str(min(data.x.shape[1], concrete_num_feats))
        else:
            assert False
        data.data_set_ids = domain_ids

    data.y = table[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    # Debug switch: when enabled, only plots the features and saves nothing.
    viz = False
    if viz:
        # NOTE(review): domain_ids / domain_ind exist only when transfer is true.
        keep = domain_ids > 0
        domain_ids = domain_ids[keep]
        table = table[keep, :]
        # NOTE(review): np.delete returns a new array and the result is
        # discarded here, so the column is still present in `table`.
        np.delete(table, domain_ind, 1)
        viz_features(table, table[:, -1], domain_ids, field_names)
        return

    data.x = array_functions.standardize(data.x)
    helper_functions.save_object(concrete_file % name_suffix, data)
예제 #5
0
def create_bike_sharing():
    """Build the single-feature bike-sharing regression data set and save it.

    File columns 0 and 2-15 of ``bike_sharing/day.csv`` are loaded as numbers.
    The loaded column named "yr" provides ``data_set_ids`` and the last loaded
    column is the target (normalized). Loaded columns 8-11 are plotted with
    ``viz_features``; the one at position ``field_to_use`` within that group is
    kept as the only feature, standardized.

    The object is saved under ``bike_file % ('-feat=' + str(field_to_use))``.
    """
    csv_path = "bike_sharing/day.csv"
    # list(...) keeps the concatenation valid on Python 3, where range() no
    # longer returns a list; on Python 2 the result is unchanged.
    columns = [0] + list(range(2, 16))
    header = pd.read_csv(csv_path, nrows=1, dtype="string")
    all_field_names = np.asarray(header.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(csv_path, skiprows=1, delimiter=",", usecols=columns)
    # Boolean mask over the loaded columns selecting the "yr" column.
    domain_ind = used_field_names == "yr"
    domain_ids = np.squeeze(bike_data[:, domain_ind])

    viz = True
    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        viz_features(x, y, domain_ids, used_field_names, learner=None)
    field_to_use = 1
    x = x[:, field_to_use]

    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids

    s = bike_file % ("-feat=" + str(field_to_use))
    helper_functions.save_object(s, data)
예제 #6
0
def create_boston_housing(file_dir=""):
    """Build the Boston housing regression data set and save it.

    When the module-level ``create_transfer_data`` is true, feature column 12
    is binned into 4 groups by ``array_functions.bin_data`` to form
    ``data_set_ids``, and the module-level ``boston_num_feats`` selects either
    the single feature at column 5 or all features (standardized). Otherwise
    the raw features are saved unchanged.

    The file name is ``boston_housing_raw_data_file`` formatted with a suffix
    ("" or "-<number of features>"), optionally prefixed by ``file_dir``.

    :param file_dir: directory prepended to the file name when not empty
    """
    boston = datasets.load_boston()
    data = data_class.Data()
    data.x = boston.data
    data.y = boston.target
    data.feature_names = list(boston.feature_names)

    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    file_name = boston_housing_raw_data_file
    features = data.x
    if not create_transfer_data:
        file_name %= ""
    else:
        feature_col = 5
        domain_col = 12
        # NOTE(review): the domain column is not removed from data.x, so it is
        # still among the features that get saved - confirm this is intended.
        data.data_set_ids = array_functions.bin_data(
            features[:, domain_col], num_bins=4
        )

        if boston_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feature_col])
            file_name = file_name % ""
        elif boston_num_feats >= data.x.shape[1]:
            data.x = array_functions.standardize(data.x)
            p = min(boston_num_feats, data.x.shape[1])
            file_name = file_name % ("-" + str(p))
        else:
            assert False

    if file_dir != "":
        file_name = file_dir + "/" + file_name
    helper_functions.save_object(file_name, data)
예제 #7
0
def create_bike_sharing():
    """Build the single-feature bike-sharing regression data set and save it.

    File columns 0 and 2-15 of ``bike_sharing/day.csv`` are loaded as numbers.
    The loaded column named 'yr' provides ``data_set_ids`` and the last loaded
    column is the target (normalized). Loaded columns 8-11 are plotted with
    ``viz_features``; the one at position ``field_to_use`` within that group is
    kept as the only feature, standardized.

    The object is saved under ``bike_file % ('-feat=' + str(field_to_use))``.
    """
    csv_path = 'bike_sharing/day.csv'
    # list(...) keeps the concatenation valid on Python 3, where range() no
    # longer returns a list; on Python 2 the result is unchanged.
    columns = [0] + list(range(2, 16))
    header = pd.read_csv(csv_path, nrows=1, dtype='string')
    all_field_names = np.asarray(header.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(csv_path, skiprows=1, delimiter=',', usecols=columns)
    # Boolean mask over the loaded columns selecting the 'yr' column.
    domain_ind = used_field_names == 'yr'
    domain_ids = np.squeeze(bike_data[:, domain_ind])

    viz = True
    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        viz_features(x, y, domain_ids, used_field_names, learner=None)
    field_to_use = 1
    x = x[:, field_to_use]

    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids

    s = bike_file % ('-feat=' + str(field_to_use))
    helper_functions.save_object(s, data)
예제 #8
0
def create_boston_housing(file_dir=''):
    """Load Boston housing data and run the hard-coded y-split experiment.

    With the flags as set in the body (``create_y_split = True``), the targets
    are binned into 2 groups used as ``data_set_ids``, every instance is marked
    as training data, the configured project learner is trained and tested,
    one value of ``learner.g_learner.g`` is printed per feature name
    (magnitudes below 1e-6 shown as 0), and the process exits without saving.

    The remaining branches (transfer data set / raw data set, followed by
    ``helper_functions.save_object``) only run if the flags are edited.

    :param file_dir: directory prepended to the file name when not empty
    """
    boston = datasets.load_boston()
    data = data_class.Data()
    data.x = boston.data
    data.y = boston.target
    data.feature_names = list(boston.feature_names)

    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    file_name = boston_housing_raw_data_file
    features = data.x
    targets = data.y
    create_transfer_data = False
    create_y_split = True
    if create_y_split:
        from base import transfer_project_configs as configs_lib
        project_configs = configs_lib.ProjectConfigs()
        learner = configs_lib.MainConfigs(project_configs).learner
        # Silence everything except the g learner.
        learner.quiet = True
        learner.target_learner[0].quiet = True
        learner.source_learner.quiet = True
        learner.g_learner.quiet = False

        data.data_set_ids = array_functions.bin_data(data.y, num_bins=2)
        data.is_train[:] = True
        # Per-feature Pearson correlation with the target; computed but not
        # used further below.
        corrs = [
            scipy.stats.pearsonr(features[:, col], targets)[0]
            for col in range(features.shape[1])
        ]
        learner.train_and_test(data)
        print('Just playing with data - not meant to save it')
        for idx, feature_name in enumerate(data.feature_names):
            g_value = learner.g_learner.g[idx]
            if abs(g_value) < 1e-6:
                g_value = 0
            print(feature_name + ': ' + str(g_value))
        exit()
    elif create_transfer_data:
        feature_col = 5
        domain_col = 12
        # NOTE(review): the domain column is not removed from data.x, so it is
        # still among the features that get saved - confirm this is intended.
        data.data_set_ids = array_functions.bin_data(
            features[:, domain_col], num_bins=4
        )

        if boston_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feature_col])
            file_name = file_name % ''
        elif boston_num_feats >= data.x.shape[1]:
            data.x = array_functions.standardize(data.x)
            p = min(boston_num_feats, data.x.shape[1])
            file_name = file_name % ('-' + str(p))
        else:
            assert False
    else:
        file_name %= ''

    if file_dir != '':
        file_name = file_dir + '/' + file_name
    helper_functions.save_object(file_name, data)
예제 #9
0
def create_wine(data_to_create=WINE_RED):
    """Build one wine-quality regression data set and save it.

    Both ';'-delimited wine CSV files are always loaded. Depending on
    ``data_to_create``:

    * ``WINE_TRANSFER`` - red and white rows are stacked, with an id column
      (0 for red, 1 for white) used as ``data_set_ids``; the features are
      also passed to ``viz_features`` together with a fresh learner.
    * ``WINE_RED`` / ``WINE_WHITE`` - only that table is used and
      ``data_set_ids`` is ``None``.

    The last column of the loaded table is the target, the rest are features
    (standardized before saving). The object is saved under
    ``wine_file % ('-' + suffix)``.

    :param data_to_create: one of WINE_RED, WINE_WHITE, WINE_TRANSFER
    """
    names, red = load_csv('wine/winequality-red.csv', delim=';')
    white = load_csv('wine/winequality-white.csv', delim=';')[1]

    if data_to_create == WINE_TRANSFER:
        # Tag every row with its origin before stacking the two tables.
        red_tagged = np.hstack((red, np.zeros((red.shape[0], 1))))
        white_tagged = np.hstack((white, np.ones((white.shape[0], 1))))
        combined = np.vstack((red_tagged, white_tagged))

        ids = combined[:, -1]
        x = combined[:, :-2]
        y = combined[:, -2]
        viz_features(x, y, ids, names[:-1], alpha=.01, learner=make_learner())
        suffix = 'transfer'
    else:
        if data_to_create == WINE_RED:
            table, suffix = red, 'red'
        elif data_to_create == WINE_WHITE:
            table, suffix = white, 'white'
        else:
            assert False

        ids = None
        x = table[:, :-1]
        y = table[:, -1]

    data = data_class.Data()
    data.x = array_functions.standardize(x)
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.data_set_ids = ids
    data.is_regression = True
    # Disabled alternative kept for reference: subsample each domain
    # (rand_sample(.25, ids == 0), rand_sample(.1, ids == 1)) and save under
    # wine_file % ('-small-' + str(data.p)).
    helper_functions.save_object(wine_file % ('-' + suffix), data)
    def train(self, data):
        """Fit the ``g`` and ``h`` vectors on the labeled target training data.

        Builds RBF matrices on the standardized features (labeled target
        training points against themselves, and all target points against the
        labeled ones), multiplies each by the inverse of the ``C2``-regularized
        labeled matrix, and minimizes ``SMSTransfer.eval`` with L-BFGS-B using
        ``SMSTransfer.gradient``, starting from zeros.

        Side effects: sets ``self.g``, ``self.h`` and ``self.opt_succeeded``,
        stores ``R_ul`` on ``data``, and prints a message when the optimizer
        reports failure.

        :param data: regression data set exposing the masks and arrays read below
        """
        assert data.is_regression
        # Both returned values are replaced a few lines below; the call is
        # kept as-is since its side effects (if any) are not visible here.
        y_s, y_true = self.get_predictions(data)
        target_mask = data.is_target
        labeled_mask = target_mask & data.is_labeled & data.is_train
        y_s = data.y[labeled_mask]
        y_true = data.true_y[labeled_mask]

        x_std = array_functions.standardize(data.x)
        x_target = x_std[target_mask]
        x_labeled = x_std[labeled_mask]

        W_ll = array_functions.make_rbf(x_labeled, self.sigma, self.metric)
        n_labeled = W_ll.shape[0]
        W_ll_reg_inv = np.linalg.inv(W_ll + self.C2*np.eye(n_labeled))
        W_ul = array_functions.make_rbf(
            x_target, self.sigma, self.metric, x2=x_labeled
        )
        R_ll = W_ll.dot(W_ll_reg_inv)
        R_ul = W_ul.dot(W_ll_reg_inv)
        assert not array_functions.has_invalid(R_ll)
        assert not array_functions.has_invalid(R_ul)

        def reg(gh):
            # Regularizer bound to this problem's R_ul.
            return SMSTransfer.reg(gh, R_ul)

        # g and h are optimized jointly as one stacked column vector.
        g0 = np.zeros((R_ll.shape[0] * 2, 1))
        shared_kwargs = dict(
            method='L-BFGS-B',
            options={'disp': False, 'maxiter': np.inf, 'maxfun': np.inf},
            constraints=[],
            args=(R_ll, R_ul, y_s, y_true, self.C, reg),
        )
        results = optimize.minimize(
            SMSTransfer.eval, g0, jac=SMSTransfer.gradient, **shared_kwargs
        )

        # Debug switch: re-run without the analytic gradient and compare.
        check_results = False
        if check_results:
            gh_ids = np.zeros(g0.shape)
            gh_ids[R_ll.shape[0]:] = 1
            results2 = optimize.minimize(
                SMSTransfer.eval, g0, jac=None, **shared_kwargs
            )
            print(self.params)
            scipy_opt_methods.compare_results(results, results2, gh_ids)
            print(results.x)
            print(results2.x)

        g, h = SMSTransfer.unpack_gh(results.x, R_ll.shape[0])
        self.opt_succeeded = results.success
        if not results.success:
            print('SMS Opt failed')

        data.R_ul = R_ul
        self.g = g
        self.h = h