예제 #1
0
파일: gtv.py 프로젝트: abbystvns/swus_gtv
def full_weighted_cv(X, y, Ds, lambda_gtv=np.linspace(.1, 1, 10), lambda_lasso=None, t=50, auto_cv=True, alpha=.9, k=5):
    """Grid-search GTV penalties on a temporal split with weighted training rows.

    The data are split with ``temporal_split(X, y, t)``. When ``alpha < 1`` the
    i-th of the ``n`` training rows (1-based) is scaled by
    ``sqrt(alpha ** (n - i))``, so the last training row keeps weight 1 and
    earlier rows are down-weighted. For every ``lambda_gtv`` value and every
    entry of ``Ds`` an augmented lasso system is built, fitted with glmnet,
    and the resulting coefficients (intercept dropped) are mapped back through
    ``invD`` to obtain ``beta``.

    Args:
        X: Design matrix passed to ``temporal_split``.
        y: Response passed to ``temporal_split``.
        Ds: Mapping from a method label to the matrix ``D`` handed to
            ``augmented_system_lasso``.
        lambda_gtv: Iterable of GTV penalty values (``lambda_tv`` column).
        lambda_lasso: Iterable of lasso penalties; required when ``auto_cv``
            is False, ignored otherwise.
        t: Third argument of ``temporal_split`` (its meaning is defined there).
        auto_cv: If True, the lasso penalty is chosen by ``cvglmnet``
            (``lambda_min``); otherwise every value of ``lambda_lasso`` is tried.
        alpha: Decay factor of the row weights; no weighting when ``alpha >= 1``.
        k: Number of cross-validation folds given to ``cvglmnet``.

    Returns:
        pandas.DataFrame with one row per fitted model and the columns
        ``method, lambda_tv, lambda_1, train_mse, train_r2, test_mse, test_r2``.

    Raises:
        ValueError: If ``auto_cv`` is False and ``lambda_lasso`` is None.
    """
    if not auto_cv and lambda_lasso is None:
        raise ValueError("lambda_lasso must be provided when auto_cv is False")

    X_train, X_test, y_train, y_test = temporal_split(X, y, t)
    if alpha < 1:
        n = X_train.shape[0]
        # Exponents run n-1, ..., 0, i.e. the most recent row has weight 1.
        root_weights = np.sqrt(alpha ** (n - np.arange(1, n + 1)))
        X_train = X_train * root_weights.reshape(-1, 1)
        y_train = y_train * root_weights

    errors = []
    for l1 in lambda_gtv:
        for m, D in Ds.items():
            if auto_cv:
                XD, bigY, invD = augmented_system_lasso(X_train, y_train, D, l1, 0, l1_only=True)
                fit = cvglmnet(x=XD, y=bigY, family='gaussian', ptype='mse', nfolds=k)
                b = cvglmnetCoef(fit, s='lambda_min')
                l3 = fit['lambda_min'][0]
                # Drop the intercept, then map back to the original parametrisation.
                beta = invD @ b.reshape(b.shape[0])[1:]
                mset, r2t = compute_errors(y_train, X_train @ beta)
                mse, r2 = compute_errors(y_test, X_test @ beta)
                errors.append([m, l1, l3, mset, r2t, mse, r2])
            else:
                for l3 in lambda_lasso:
                    XD, bigY, invD = augmented_system_lasso(X_train, y_train, D, l1 / l3, 0, l1_only=True)
                    fit = glmnet(x=XD, y=bigY)
                    # np.float64 replaces the scipy.float64 alias, which newer
                    # SciPy releases no longer expose.
                    b = glmnetCoef(fit, s=np.float64([l3]), exact=False)
                    beta = invD @ b.reshape(b.shape[0])[1:]
                    mset, r2t = compute_errors(y_train, X_train @ beta)
                    mse, r2 = compute_errors(y_test, X_test @ beta)
                    errors.append([m, l1, l3, mset, r2t, mse, r2])

    columns = ['method', 'lambda_tv', 'lambda_1', 'train_mse', 'train_r2', 'test_mse', 'test_r2']
    return pd.DataFrame(errors, columns=columns)
예제 #2
0
파일: pltr.py 프로젝트: panguit/PLTR
    def extract_rules(self,
                      var_name_list=None,
                      lambda_n=None,
                      return_coefs=True):
        """
        Extract the rules selected by the lasso model for one lambda.

        A rule is produced for every variable whose lasso coefficient is
        non-zero at the chosen lambda index.

        INPUT :
        var_name_list : initial variable names, forwarded to
            ``self.var_to_string``.
        lambda_n : int or None, index of the selected lambda. If None,
            ``self.lasso_best_lambda`` is used.
        return_coefs : bool, if True the rules are returned together with
            their coefficients.

        RETURN :
        If ``return_coefs`` is False, a list with one rule per selected
        variable (whatever ``self.var_to_string`` returns). If True, a
        DataFrame with the columns 'Rules' and 'Lasso Coef.' (float).
        """
        sel_lambda = self.lasso_best_lambda if lambda_n is None else lambda_n

        # Coefficients at the chosen lambda; row 0 is skipped because the
        # first coefficient corresponds to the intercept. Computed once and
        # reused for both the selection and the returned values.
        lambda_coefs = glmnetCoef(self.lasso_mod)[1:, sel_lambda]

        # Indices of the variables with a non-zero lasso coefficient.
        sel_cols = np.argwhere(lambda_coefs != 0)[:, 0]

        rules = [
            self.var_to_string(var_string, var_name_list)
            for var_string in self.sel_var_names[sel_cols]
        ]

        if not return_coefs:
            return rules

        # Build the frame column-wise so the coefficients stay numeric instead
        # of being cast to strings and parsed back.
        table = pd.DataFrame({
            'Rules': rules,
            'Lasso Coef.': lambda_coefs[sel_cols],
        })
        return table.astype({'Lasso Coef.': 'float'})
예제 #3
0
파일: gtv.py 프로젝트: abbystvns/swus_gtv
def weighted_gtv(X, y, D, l1, l3, alpha=.9):
    """Fit a single GTV model on exponentially weighted rows and return beta.

    When ``alpha < 1`` the i-th of the ``n`` rows (1-based) of ``X`` and ``y``
    is scaled by ``sqrt(alpha ** (n - i))``, so the last row keeps weight 1.
    The augmented lasso system is then built with penalty ratio ``l1 / l3``,
    fitted with glmnet, evaluated at lambda ``l3`` and mapped back through
    ``invD``.

    Args:
        X: Design matrix with one row per observation.
        y: Response with one entry per row of ``X``.
        D: Matrix handed to ``augmented_system_lasso``.
        l1: GTV penalty.
        l3: Lasso penalty at which the glmnet path is evaluated.
        alpha: Decay factor of the row weights; no weighting when ``alpha >= 1``.

    Returns:
        The coefficient vector ``invD @ b[1:]`` where ``b`` holds the glmnet
        coefficients (first entry, the intercept slot, dropped).
    """
    if alpha < 1:
        n = X.shape[0]
        # Exponents run n-1, ..., 0, i.e. the most recent row has weight 1.
        root_weights = np.sqrt(alpha ** (n - np.arange(1, n + 1)))
        X = X * root_weights.reshape(-1, 1)
        y = y * root_weights

    XD, bigY, invD = augmented_system_lasso(X, y, D, l1 / l3, 0, l1_only=True)
    fit = glmnet(x=XD, y=bigY)
    # np.float64 replaces the scipy.float64 alias, which newer SciPy releases
    # no longer expose.
    b = glmnetCoef(fit, s=np.float64([l3]), exact=False)
    return invD @ b.reshape(b.shape[0])[1:]
예제 #4
0
 def lassotrans(beta, w, y, omega, lam, eta, offset):
     """Return an updated ``beta`` from one glmnet lasso fit around ``beta``.

     The function evaluates ``exp(w @ beta - offset - beta' omega beta / 2)``,
     forms the gradient-like term ``dLbeta`` (divided by ``n = y.shape[0]``),
     builds a diagonal design ``X`` and a target ``Y`` from
     ``beta - dLbeta / eta``, and fits glmnet without intercept at the single
     penalty ``lam``.

     Args:
         beta: Current coefficient vector.
         w: Matrix multiplied with ``beta``.
         y: Response; its first dimension gives ``n``.
         omega: Matrix of the quadratic term ``beta' omega beta``.
         lam: Lasso penalty passed to glmnet as ``lambdau``.
         eta: Scaling constant used in ``sqrt(eta / 2)`` and ``dLbeta / eta``.
         offset: Value subtracted inside the exponential.

     Returns:
         1-D array: rows 1 onward of column 0 of the glmnet coefficients.
     """
     n = y.shape[0]
     quadratic = np.matmul(np.matmul(np.transpose(beta), omega), beta) / 2
     expterm = np.exp(np.matmul(w, beta) - offset - quadratic)
     omegabeta = np.matmul(omega, beta)[np.newaxis, :]
     dLbeta = -(np.matmul(np.transpose(y), w)
                - np.matmul(np.transpose(expterm), (w - omegabeta))) / n
     # NOTE(review): `p` is not defined in this function; it has to come from
     # an enclosing/module scope -- confirm it equals the length of beta.
     Y = p * np.sqrt(eta / 2) * (beta - dLbeta / (eta))
     X = p * np.eye(p) * np.sqrt(eta / 2)
     # np.float64 replaces the scipy.float64 alias, which newer SciPy releases
     # no longer expose.
     fit = glmnet(x=np.float64(X), y=np.float64(Y),
                  lambdau=np.float64([lam]), intr=False)
     return np.array(glmnetCoef(fit, s=np.float64([0])))[1:, 0]
예제 #5
0
def elastic_coxph(x, surv, cen, x_names, alp=False, lam=False):
    """Fit elastic-net Cox models over a grid of alpha and lambda values.

    The response is built as ``np.stack([surv, 1 - cen], axis=1)``
    (presumably time and event indicator -- confirm the meaning of ``cen``
    with the caller). For every alpha a glmnet Cox path is fitted and, for
    every lambda, the coefficients are read off the path.

    Args:
        x: Design matrix handed to glmnet.
        surv: First column of the response.
        cen: Array whose complement ``1 - cen`` is the second response column.
        x_names: Feature names, indexed with the positions of the non-zero
            coefficients.
        alp: Alpha value(s) to try. ``False``/``None`` selects the default
            grid ``[0.1, 0.5, 1]``. Previously this argument was ignored.
        lam: Lambda value(s) to try. ``False``/``None`` selects the default
            grid ``[0.1, 0.5, 1]``. Previously this argument was ignored.

    Returns:
        List of ``(alpha, lambda, features, beta)`` tuples, where ``features``
        is ``x_names`` restricted to the non-zero coefficients, or ``[]`` when
        no coefficient is non-zero or ``x_names`` cannot be indexed that way.
    """
    y = np.stack([surv, 1 - cen], axis=1)

    default_grid = [0.1, 0.5, 1]
    alphas = default_grid if alp is False or alp is None else np.atleast_1d(alp)
    lambdas = default_grid if lam is False or lam is None else np.atleast_1d(lam)

    result = []
    for a in tqdm(alphas):
        fit = glmnet.glmnet(x=x.copy(), y=y.copy(), family='cox', alpha=a, nlambda=100)

        for lam_value in lambdas:
            # np.float64 replaces the scipy.float64 alias, which newer SciPy
            # releases no longer expose.
            beta = glmnetCoef.glmnetCoef(fit, s=np.float64([lam_value]), exact=False)
            beta = beta.flatten()

            nonzero = np.flatnonzero(beta)
            if nonzero.size == 0:
                features = []
            else:
                try:
                    features = x_names[nonzero]
                except (IndexError, KeyError, TypeError, ValueError):
                    # Best effort: x_names does not support array indexing.
                    features = []
            result.append((a, lam_value, features, beta))
    return result
예제 #6
0
def cvglmnetCoef(obj, s=None):
    """Return the coefficients of a cross-validated glmnet fit.

    Args:
        obj: Mapping holding at least ``'glmnet_fit'`` and the keys
            ``'lambda_1se'`` / ``'lambda_min'``.
        s: Which lambda(s) to evaluate. ``None`` or an empty value selects
            ``obj['lambda_1se']``; a numpy array is used as is; a string is
            matched case-insensitively as a prefix of ``'lambda_1se'`` or
            ``'lambda_min'`` (first match wins).

    Returns:
        Whatever ``glmnetCoef(obj['glmnet_fit'], lambdau)`` returns.

    Raises:
        ValueError: If ``s`` is neither an array nor a recognised string.
    """
    # Local import: numpy.ndarray replaces the scipy.ndarray alias, which
    # newer SciPy releases no longer expose.
    import numpy as np

    # Scalars have no len(); guard so they reach the ValueError below instead
    # of failing with a TypeError here.
    if s is None or (hasattr(s, '__len__') and len(s) == 0):
        s = obj['lambda_1se']

    if isinstance(s, np.ndarray):
        lambdau = s
    elif isinstance(s, str):
        prefix = s.lower()
        matches = [name for name in ('lambda_1se', 'lambda_min')
                   if name.startswith(prefix)]
        if not matches:
            raise ValueError(
                "Invalid form of s: expected 'lambda_1se' or 'lambda_min'")
        lambdau = obj[matches[0]]
    else:
        raise ValueError('Invalid form of s')

    return glmnetCoef(obj['glmnet_fit'], lambdau)
예제 #7
0
    def fit(self, train_x, train_y, feature_names):
        """
        Fit an LDA topic model on ``train_x``, then a glmnet Cox model on the
        resulting topic proportions.

        :param train_x: n_samples * n_features matrix given to LDA
        :param train_y: 2-column array; column 0 is read as the observed time
            and column 1 as the event indicator
        :param feature_names: stored on ``self._feature_names``

        NOTE(review): in the code visible here ``self.log_baseline_hazard`` is
        only zero-initialised; nothing below fills it in -- confirm whether the
        method continues beyond this excerpt.
        """
        self._feature_names = feature_names

        print("Start fitting LDA...")
        tic = time.time()
        # NOTE(review): n_jobs is hard-coded to 8 worker processes.
        self.lda = LatentDirichletAllocation(n_components=self.n_topics, learning_method='online', \
                                             random_state=self.seed, n_jobs=8)
        thetas = self.lda.fit_transform(
            train_x
        )  # train_x: n_samples * n_features --> thetas: n_samples * n_topics
        toc = time.time()
        print("Finish fitting LDA... time spent {} seconds.".format(toc - tic))

        # Find beta. Modified from George's demo.
        # The Cox model is fitted on the topic proportions, not on train_x.
        print("Start fitting CoxPH...")
        tic = time.time()
        fit = glmnet(
            x=thetas.copy(),
            y=train_y.copy(),
            family='cox',
            alpha=self._alpha,
            standardize=False,  # we performed our own standardization
            intr=False)
        # Coefficients evaluated at the single penalty self._lambda, flattened to 1-D.
        self.beta = glmnetCoef(fit, s=np.array([self._lambda])).flatten()
        toc = time.time()
        print("Finish fitting CoxPH... time spent {} seconds.".format(toc -
                                                                      tic))

        observed_times = train_y[:, 0]
        event_indicators = train_y[:, 1]
        # For each observed time, how many times the event occurred
        event_counts = Counter()
        for t, r in zip(observed_times, event_indicators):
            event_counts[t] += int(r)
        # Sorted list of observed times
        self.sorted_unique_times = np.sort(list(event_counts.keys()))
        self.num_unique_times = len(self.sorted_unique_times)
        # One slot per unique observed time, initialised to zero.
        self.log_baseline_hazard = np.zeros(self.num_unique_times)
예제 #8
0
파일: pltr.py 프로젝트: panguit/PLTR
 def lasso_coef(self):
     """Return the glmnet coefficients of ``self.lasso_mod``.

     When ``self.lasso_best_lambda`` is set, only that column of the
     coefficient array is returned; otherwise the whole array is returned.
     """
     coef_path = glmnetCoef(self.lasso_mod)
     best = self.lasso_best_lambda
     if best is None:
         return coef_path
     return coef_path[:, best]
예제 #9
0
# Demo: weighted gaussian glmnet fit, coefficient/prediction queries and
# cross-validation. `x` and `y` are not defined in this excerpt.
#
# Observation weights: 50 rows of weight 1 stacked on 50 rows of weight 2
# (a 100 x 1 column).
# NOTE(review): scipy.ones / scipy.row_stack / scipy.float64 / scipy.random
# are NumPy aliases that recent SciPy releases no longer expose -- confirm the
# pinned SciPy version or switch to the numpy names.
t = scipy.ones((50, 1), dtype=scipy.float64)
wts = scipy.row_stack((t, 2 * t))

# call glmnet (elastic-net mixing alpha = 0.2, path of 20 lambdas)
fit = glmnet.glmnet(x = x.copy(), y = y.copy(), family = 'gaussian', \
                    weights = wts, \
                    alpha = 0.2, nlambda = 20
                    )

# Print and plot the fitted path (against lambda, then against deviance).
glmnetPrint.glmnetPrint(fit)
glmnetPlot.glmnetPlot(fit, xvar='lambda', label=True)
glmnetPlot.glmnetPlot(fit, xvar='dev', label=True)
# Checks whether 0.5 is one of the fitted lambdas; the result is discarded
# when run as a script.
any(fit['lambdau'] == 0.5)
# Coefficients at lambda = 0.5 with exact=False.
coefApprx = glmnetCoef.glmnetCoef(fit, s=scipy.float64([0.5]), exact=False)
print(coefApprx)
# Response-scale predictions for the first five rows at lambda = 0.05.
fc = glmnetPredict.glmnetPredict(fit, x[0:5,:], ptype = 'response', \
                                s = scipy.float64([0.05]))
print(fc)
# 20-fold cross-validation scored by mean squared error.
cvfit = cvglmnet.cvglmnet(x=x.copy(), y=y.copy(), ptype='mse', nfolds=20)
cvfit['lambda_min']
cvglmnetCoef.cvglmnetCoef(cvfit, s='lambda_min')
#%%
cvglmnetPredict.cvglmnetPredict(cvfit, newx=x[0:5, ], s='lambda_min')

#%%
# Random fold assignment: one id in 0..9 per observation, drawn with replacement.
foldid = scipy.random.choice(10, size=y.shape[0], replace=True)
    # Reweighting columns by coeff. randomization
    Xmod = np.zeros((M_tmp, N))

    Xmod[:, w_on] = w * Xbs[:, w_on]
    Xmod[:, w_off] = Xbs[:, w_off]

    fit = glmnet(x=Xmod,
                 y=Ybs,
                 family='gaussian',
                 alpha=1.0,
                 maxit=10**8,
                 intr=False,
                 standardize=False,
                 thresh=1e-10,
                 lambdau=np.array([0.02, lambda1]))
    glmnet_ret = glmnetCoef(fit)

    betaV[w_on, nexp] = w * glmnet_ret[:, 1][1:NEXP + 1][w_on]
    betaV[w_off, nexp] = glmnet_ret[:, 1][1:NEXP + 1][w_off]

endTime = time.time()
t2 = endTime - startTime

print([t1, t2])  # elapsed time

# Mean value of beta
plt.figure(1)
_, NEXP = betaV.shape
plt.scatter(fit_AMPR_beta,
            np.mean(betaV[:, 0:NEXP], 1),
            color='blue',
예제 #11
0
# Demo: Poisson glmnet fit on the example data set.
# Reload the helper modules so edits to them are picked up in this session.
importlib.reload(cvglmnetPlot)
importlib.reload(cvglmnetPredict)

# parameters
baseDataDir = '../data/'

# load data
# NOTE(review): scipy.loadtxt / scipy.float64 / scipy.array are NumPy aliases
# that recent SciPy releases no longer expose -- confirm the pinned SciPy
# version or switch to the numpy names.
x = scipy.loadtxt(baseDataDir + 'PoissonExampleX.dat',
                  dtype=scipy.float64,
                  delimiter=',')
y = scipy.loadtxt(baseDataDir + 'PoissonExampleY.dat',
                  dtype=scipy.float64,
                  delimiter=',')

# call glmnet
fit = glmnet.glmnet(x=x.copy(), y=y.copy(), family='poisson')

glmnetPlot.glmnetPlot(fit)

# Coefficients at lambda = 1.0; the result is discarded when run as a script.
glmnetCoef.glmnetCoef(fit, s=scipy.float64([1.0]))

# Response-scale predictions for the first five rows at two lambdas.
f = glmnetPredict.glmnetPredict(fit,
                                x[0:5, :],
                                ptype='response',
                                s=scipy.float64([0.1, 0.01]))
print(f)

# Cross-validate, then query coefficients at both lambda_min and lambda_1se.
cvfit = cvglmnet.cvglmnet(x.copy(), y.copy(), family='poisson')
optlam = scipy.array([cvfit['lambda_min'], cvfit['lambda_1se']]).reshape(2, )
cvglmnetCoef.cvglmnetCoef(cvfit, s=optlam)
예제 #12
0
# Predict the test rows in chunks of `chunksize`, score every lambda by AUC,
# then export the terms selected at one hard-coded lambda index.
# `fit`, `predictions`, `all_counts`, `test_sample`, `chunksize`, `num_s`,
# `pt_data`, `count_vect`, `output_dir` and `model_config` are not defined in
# this excerpt.
for i in range(int(len(test_sample) / chunksize)):  #looping avoids MemoryError
    predictions[(i * chunksize):((i + 1) * chunksize), :] = glmnetPredict(
        fit,
        all_counts[test_sample[(i * chunksize):((i + 1) * chunksize)], :],
        ptype='response')

# Remainder after the last full chunk.
# NOTE(review): this reuses `i` from the loop above; if that loop runs zero
# times (len(test_sample) < chunksize) `i` is not set by it -- confirm.
predictions[((i + 1) * chunksize):, :] = glmnetPredict(
    fit, all_counts[test_sample[((i + 1) * chunksize):], :], ptype='response')

# Column i of `predictions` is scored against lambda i of the path.
for i in range(num_s):
    test_auc = metrics.roc_auc_score(
        pt_data['notes'].loc[test_sample].surv_12mo, predictions[:, i])
    print(i, fit['lambdau'][i], fit['df'][i], test_auc)

# Hard-coded index into the lambda path.
best_lambda_i = 30
# Row 0 of the coefficients is dropped (presumably the intercept -- confirm).
coefs = glmnetCoef(fit, s=scipy.float64([fit['lambdau'][best_lambda_i]
                                         ]))[1:].flatten()
# NOTE(review): if count_vect is a scikit-learn vectorizer, newer releases
# name this method get_feature_names_out -- confirm the pinned version.
features = pd.DataFrame({
    'feature': count_vect.get_feature_names(),
    'coef': coefs
})
features.loc[:, 'coef_abs'] = abs(features.coef)
features_sorted = features.sort_values(by='coef_abs', ascending=False)
features_sorted = features_sorted.reset_index()

# Terms with a non-zero coefficient, taken from the unsorted `features` frame.
selected_terms = features.feature.loc[abs(coefs) > 0].tolist()
selected_terms_frame = pd.DataFrame({'feature': selected_terms})
selected_terms_frame.loc[:, 'exclude'] = 0
selected_terms_frame.to_csv(output_dir + 'models/' +
                            model_config['model_name'] +
                            '/text_features_unedited.csv',
                            index=False)
예제 #13
0
        glmnetPredict(fit1, scipy.empty([0]), scipy.empty([0]),
                      'coefficients'))

    fit2 = glmnet.glmnet(x=x.copy(), y=g2.copy(), family='binomial')
    print(glmnetPredict(fit2, x[2:5, :], scipy.empty([0]), 'response'))
    print(glmnetPredict(fit2, scipy.empty([0]), scipy.empty([0]), 'nonzero'))

    fit3 = glmnet.glmnet(x=x.copy(), y=g4.copy(), family='multinomial')
    print(glmnetPredict(fit3, x[0:3, :], scipy.array([0.01]), 'response'))
    print(glmnetPredict(fit3, x[0:3, :], scipy.array([0.01, 0.5]), 'response'))

elif section == 8:
    x = scipy.random.rand(100, 20)
    y = scipy.random.rand(100, 1)
    fit = glmnet.glmnet(x=x.copy(), y=y.copy())
    ncoef = glmnetCoef(fit, scipy.array([0.01, 0.001]))

elif section == 9:
    scipy.random.seed(1)
    x = scipy.random.normal(size=(100, 20))
    y = scipy.random.normal(size=(100, 1))
    g2 = scipy.random.choice(2, size=(100, 1)) * 1.0
    g4 = scipy.random.choice(4, size=(100, 1)) * 1.0

    plt.figure()
    fit1 = cvglmnet(x=x.copy(), y=y.copy())
    cvglmnetPlot(fit1)

    plt.figure()
    fit2 = cvglmnet(x=x.copy(), y=g2.copy(), family='binomial')
    cvglmnetPlot(fit2)
예제 #14
0
    def fit(self,
            train_x,
            train_y,
            feature_names,
            duration_col='LOS',
            event_col='OUT'):
        """
        Given the train dataset, we firstly use glmnet to find the beta (for
        regression). Then we calculate the log baseline hazard (implemented by
        George, modified by Ren).

        :param train_x: feature matrix, one row per subject and one column per
            entry of ``feature_names``
        :param train_y: 2-column array; column 0 holds the durations and
            column 1 the event indicators
        :param feature_names: column names for ``train_x``; stored on
            ``self._feature_names``
        :param duration_col: the column name for duration
        :param event_col: the column name for event
        """
        train_df = pd.DataFrame(data=train_x, columns=feature_names)
        train_df[duration_col] = train_y[:, 0]
        train_df[event_col] = train_y[:, 1]

        self._feature_names = feature_names
        self._duration_col = duration_col
        self._event_col = event_col

        # From here on train_x / train_y are whatever _standardize_df returns
        # for the feature and the duration/event columns respectively.
        train_df = self._standardize_df(train_df, flag='train')
        train_y = train_df[[duration_col, event_col]].values
        train_x = train_df.drop(columns=[duration_col, event_col]).values

        # Find beta. Modified from George's demo.
        fit = glmnet(
            x=train_x.copy(),
            y=train_y.copy(),
            family='cox',
            alpha=self._alpha,
            standardize=False,  # we performed our own standardization
            intr=False)
        self.beta = glmnetCoef(fit, s=np.array([self._lambda])).flatten()

        observed_times = train_y[:, 0]
        event_indicators = train_y[:, 1]
        # For each observed time, how many times the event occurred
        event_counts = Counter()
        for t, r in zip(observed_times, event_indicators):
            event_counts[t] += int(r)
        # Sorted list of observed times
        self.sorted_unique_times = np.sort(list(event_counts))
        self.num_unique_times = len(self.sorted_unique_times)
        self.log_baseline_hazard = np.zeros(self.num_unique_times)

        # The linear predictor of a subject does not depend on the time being
        # evaluated, so compute it once instead of once per unique time.
        risk_scores = np.array(
            [np.inner(self.beta, subject_x) for subject_x in train_x])

        # Calculate the log baseline hazard. Implemented by George.
        for time_idx, t in enumerate(self.sorted_unique_times):
            # Subjects still under observation at time t.
            at_risk_scores = risk_scores[observed_times >= t]
            if event_counts[t] > 0:
                self.log_baseline_hazard[time_idx] = \
                    np.log(event_counts[t]) - logsumexp(at_risk_scores)
            else:
                # No event at this time: the hazard is zero, i.e. log is -inf.
                self.log_baseline_hazard[time_idx] = \
                    -np.inf - logsumexp(at_risk_scores)
예제 #15
0
    print(glmnetPredict(fit1, x[0:5, :], np.array([0.01, 0.005])))
    print(glmnetPredict(fit1, np.empty([0]), np.empty([0]), 'coefficients'))

    fit2 = glmnet.glmnet(x=x.copy(), y=g2.copy(), family='binomial')
    print(glmnetPredict(fit2, x[2:5, :], np.empty([0]), 'response'))
    print(glmnetPredict(fit2, np.empty([0]), np.empty([0]), 'nonzero'))

    fit3 = glmnet.glmnet(x=x.copy(), y=g4.copy(), family='multinomial')
    print(glmnetPredict(fit3, x[0:3, :], np.array([0.01]), 'response'))
    print(glmnetPredict(fit3, x[0:3, :], np.array([0.01, 0.5]), 'response'))

elif section == 8:
    x = np.random.rand(100, 20)
    y = np.random.rand(100, 1)
    fit = glmnet.glmnet(x=x.copy(), y=y.copy())
    ncoef = glmnetCoef(fit, np.array([0.01, 0.001]))

elif section == 9:
    np.random.seed(1)
    x = np.random.normal(size=(100, 20))
    y = np.random.normal(size=(100, 1))
    g2 = np.random.choice(2, size=(100, 1)) * 1.0
    g4 = np.random.choice(4, size=(100, 1)) * 1.0

    plt.figure()
    fit1 = cvglmnet(x=x.copy(), y=y.copy())
    cvglmnetPlot(fit1)

    plt.figure()
    fit2 = cvglmnet(x=x.copy(), y=g2.copy(), family='binomial')
    cvglmnetPlot(fit2)
예제 #16
0
# Demo: Poisson glmnet fit on the example data set (numpy variant).
# Reload the helper modules so edits to them are picked up in this session.
importlib.reload(cvglmnetPlot)
importlib.reload(cvglmnetPredict)

# parameters
baseDataDir = '../data/'

# load data (comma-separated, read as float64)
x = np.loadtxt(baseDataDir + 'PoissonExampleX.dat',
               dtype=np.float64,
               delimiter=',')
y = np.loadtxt(baseDataDir + 'PoissonExampleY.dat',
               dtype=np.float64,
               delimiter=',')

# call glmnet
fit = glmnet.glmnet(x=x.copy(), y=y.copy(), family='poisson')

glmnetPlot.glmnetPlot(fit)

# Coefficients at lambda = 1.0; the result is discarded when run as a script.
glmnetCoef.glmnetCoef(fit, s=np.float64([1.0]))

# Response-scale predictions for the first five rows at two lambdas.
f = glmnetPredict.glmnetPredict(fit,
                                x[0:5, :],
                                ptype='response',
                                s=np.float64([0.1, 0.01]))
print(f)

# Cross-validate, then query coefficients at both lambda_min and lambda_1se.
cvfit = cvglmnet.cvglmnet(x.copy(), y.copy(), family='poisson')
optlam = np.array([cvfit['lambda_min'], cvfit['lambda_1se']]).reshape(2, )
cvglmnetCoef.cvglmnetCoef(cvfit, s=optlam)
# Fit a lasso path for `price` on the listed predictors and label each
# coefficient curve with its design-matrix column name.
# `df_categorical`, `wts`, `dmatrices`, `preprocessing`, `glmnet` and
# `glmnetPlot` are not defined in this excerpt.
y, X = dmatrices('price ~' + 'yearOfRegistration+powerPS+kilometer+C(notRepairedDamage)+C(fuelType)+C(gearbox)+C(vehicleType)+C(brand)+C(model)', df_categorical,
                 return_type='dataframe')


# Scale predictors and response with separate MinMaxScaler instances.
min_max_scaler_x1 = preprocessing.MinMaxScaler()
x1 = min_max_scaler_x1.fit_transform(X)
min_max_scaler_y1 = preprocessing.MinMaxScaler()
y1 = min_max_scaler_y1.fit_transform(y)

# alpha=1 with a path of 100 lambdas.
fit1 = glmnet(x=x1.copy(), y=y1.copy(), family='gaussian',
              weights=wts,
              alpha=1, nlambda=100
              )
from glmnetCoef import glmnetCoef
c = glmnetCoef(fit1)
c = c[1:, -1]  # remove intercept and get the coefficients at the end of the path
import matplotlib.pyplot as plt
plt.figure(figsize=(15, 15))
h = glmnetPlot(fit1, xvar='lambda', label=False)
# /r/a/p/usr/lib64/python3.5/site-packages/glmnet_python/glmnetPlot.py
ax1 = h['ax1']
# Left x-limit of the current axes.
xloc = plt.xlim()
xloc = xloc[0]

index = h['index']
xpos = min(index)

# Place each label halfway between min(index) and the left x-limit, at the
# height of that column's coefficient at the end of the path.
labels = X.columns.tolist()
for i in range(len(c)):
    ax1.text(1 / 2 * xpos + 1 / 2 * xloc, c[i], labels[i])
예제 #18
0
# Demo: Cox glmnet fit on the example data set.
# Reload the helper modules so edits to them are picked up in this session.
importlib.reload(glmnetPlot)
importlib.reload(glmnetPrint)
importlib.reload(glmnetCoef)
importlib.reload(glmnetPredict)

importlib.reload(cvglmnet)
importlib.reload(cvglmnetCoef)
importlib.reload(cvglmnetPlot)
importlib.reload(cvglmnetPredict)

# parameters
baseDataDir = '../data/'

# load data
# NOTE(review): scipy.loadtxt / scipy.float64 are NumPy aliases that recent
# SciPy releases no longer expose -- confirm the pinned SciPy version or
# switch to the numpy names.
x = scipy.loadtxt(baseDataDir + 'CoxExampleX.dat',
                  dtype=scipy.float64,
                  delimiter=',')
y = scipy.loadtxt(baseDataDir + 'CoxExampleY.dat',
                  dtype=scipy.float64,
                  delimiter=',')

# Show the first five rows of the response.
print(y[0:5, :])

# call glmnet
fit = glmnet.glmnet(x=x.copy(), y=y.copy(), family='cox')

glmnetPlot.glmnetPlot(fit)

# Coefficients at lambda = 0.05.
c = glmnetCoef.glmnetCoef(fit, s=scipy.float64([0.05]))
print(c)