def weight_analysis(verbose=0, stack_option='s'):
  logging.info('starting ensemble weight analysis')

  stack = STACK if stack_option == 's' else MODELS

  pool = multiprocessing.Pool(processes=4)
  drivers = settings.DRIVER_IDS  # optionally slice, e.g. [:1000], for a quick run
  CUTOFF = -1  # hold out only the final row; the blender trains on everything before it
  results = pool.map(
      compute_weights,
      map(lambda x: (x, verbose, stack_option), drivers)
  )

  predictions = {}
  for i, get_data, model, _ in stack:
    predictions[i] = np.array(list(itertools.chain(*[r[1][i] for r in results])))
  testY = list(itertools.chain(*[r[2] for r in results]))

  model_names = [
      ('%s.%s.%s' % (get_data.func_name, model.__name__, i), i)  # func_name is Py2; use get_data.__name__ on Py3
      for i, get_data, model, repeat in stack
  ]
  model_names.sort(key=lambda x: x[0])
  keys = [x[1] for x in model_names]
  model_names = [x[0] for x in model_names]

  # alpha=0.0 disables the L1 penalty, so this is effectively non-negative
  # least squares (recent sklearn warns against Lasso with alpha=0)
  lasso = Lasso(alpha=0.0, positive=True)
  trainX = []
  for row_id in xrange(len(testY)):
    train_row = [predictions[i][row_id] for i in keys]
    trainX.append(train_row)

  a, b = trainX[:CUTOFF], trainX[CUTOFF:]
  c, d = testY[:CUTOFF], testY[CUTOFF:]
  lasso.fit(a, c)
  pred = lasso.predict(b)
  pred_train = lasso.predict(a)
  #logging.info('auc: %s' % util.compute_auc(d, pred))

  logging.info('coefficients:')
  weights = {}
  for i, name in enumerate(model_names):
    logging.info('%s: %.3f' % (name, lasso.coef_[i]))
    weights[keys[i]] = lasso.coef_[i]

  logging.info('individual scores:')
  for i, key in enumerate(keys):
    logging.info('%s: %.3f' % (
        model_names[i],
        util.compute_auc(testY, predictions[key])
    ))

  logging.info('weights dictionary: %s' % weights)

  # and again in the end, so you don't have to scroll
  logging.info('------------')
  #logging.info('auc: %s' % util.compute_auc(d, pred))
  logging.info('auc train: %s' % util.compute_auc(c, pred_train))
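
# How the learned non-negative weights would be applied at prediction time is
# not shown above; a minimal sketch, assuming the same `predictions` dict
# (model key -> prediction vector) and `weights` mapping as in weight_analysis:
def blend(predictions, weights):
  # weighted sum of the per-model prediction vectors; models the Lasso
  # zeroed out drop out of the ensemble automatically
  return sum(weights[k] * predictions[k] for k in weights)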
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest ('neigh' looks like a leftover name from a nearest-neighbour version)
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.loc[:, chosen])
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    # combination ('pH_for_prds' appears twice, doubling the forest's weight in the mean)
    models = ['pH_rdg_prds', 'pH_las_prds',
              'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
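
# `lass_varselect` is called above but never defined in this listing. A
# plausible reconstruction (an assumption, not the original code): fit a Lasso
# at the given alpha and return a boolean mask of the variables with nonzero
# coefficients, which matches the `pH_lassoed_vars[x]` indexing above.
def lass_varselect(train, all_vars, target, alpha):
    from sklearn.linear_model import Lasso
    lass = Lasso(alpha=alpha, positive=True)
    lass.fit(train[all_vars], train[target])
    return lass.coef_ != 0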
Example #3
def lassoRegression(X,y):

    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Lasso Regression")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)

    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    lassoRegression = Lasso(alpha=1e-7)
    lassoRegression.fit(scaled_Xp,y)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0,2,0.01)
    dummyX = dummyX.reshape((dummyX.shape[0],1))
    dummyXp = polynomialFeatures.transform(dummyX)  # reuse the expansion fitted on X
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = lassoRegression.predict(scaled_dummyXp)

    outputFILE = 'plot-lassoRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h = 6.0, w = 10.0)
    ax.axis([0,2,0,15])
    ax.scatter(X,y,color="black",s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)  # fname is positional

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
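
# Minimal usage sketch for lassoRegression above, on synthetic data (the real
# X and y are not part of this listing). X must be 2-D with values in [0, 2]
# to match the hard-coded dummy grid and axis limits:
import numpy as np
rng = np.random.default_rng(0)
X = rng.uniform(0.0, 2.0, size=(100, 1))
y = 2.0 + 3.0 * X[:, 0] ** 2 + rng.normal(0.0, 1.0, size=100)
lassoRegression(X, y)  # writes plot-lassoRegression.png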
Example #4
def traverse_movies_lasso():
	LBMAP = getLBMap()
	DMAP = createEmpty()

	P_ERRORS, ERRORS = [], []

	training_data, training_response = [], []

	for i in range(len(data)):

		movie = data[i]
		m_rev = movie['revenue']

		myvector = vectorizeMovie(movie, LBMAP, DMAP)

		if i > 3695:
			model = Lasso(alpha = .05)
			model.fit(training_data, training_response)
			raw = math.fabs(model.predict([myvector])[0] - m_rev)  # predict expects a 2-D array
			ERRORS.append(raw)
			#P_ERRORS.append(round(raw/m_rev, 4))
		
		training_data.append(myvector)
		training_response.append(m_rev)

		DMAP = update(movie, DMAP)

	#print 'all', avg_float_list(P_ERRORS)
	print 'all', avg_float_list(ERRORS)
def reg_skl_lasso(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    lasso = Lasso(alpha=param["alpha"], normalize=True)
    lasso.fit(X_tr, y_reg_tr)
    pred = lasso.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
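
# `getscoreRMSE` is external to this listing; a straightforward stand-in with
# the same (y_true, y_pred) argument order:
import numpy as np
from sklearn.metrics import mean_squared_error

def getscoreRMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))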
Example #6
def lassoreg(a):
    print ("Doing lasso regression")
    clf2 = Lasso(alpha=a)
    clf2.fit(base_X, base_Y)
    print ("Score = %f" % clf2.score(base_X, base_Y))
    clf2_pred = clf2.predict(X_test)
    write_to_file("lasso.csv", clf2_pred)
def calc_linear_regression(files, data_matrix, target, results):

    lr = Lasso()
    lr.fit(data_matrix, target)

    rss = np.mean((lr.predict(data_matrix) - target) ** 2)
    var = lr.score(data_matrix, target)

    global best
    if rss < best:
        for i in range(0,len(target)):
            print str(target[i]) + "\t" + str(lr.predict(data_matrix[i])[0])
        print lr.coef_
        best = rss

    results.append((files, rss, var, lr.coef_))
Example #8
    def classify(self):
        """Perform classification"""
        clf = Lasso(max_iter=10000000)
        #parameters = {'alpha':[0.001,0.005,0.01,0.05,0.1,0.5,1,5.0,10.0]}
        #clf = GridSearchCV(lasso, parameters,scoring='roc_auc')

        clf.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
        self._ClassifyDriver__y = clf.predict(self._ClassifyDriver__testdata)
Example #9
class Linear():
    def __init__(self, type='Ridge', alpha=3, C=1.0, nu=0.2, limit=None, \
            epsilon=0.1):
        self.limit = limit
        if type == 'Ridge':
            self.model = Ridge(alpha=alpha)
        elif type == 'SVR':
            self.model = SVR(kernel='linear', C=C, epsilon=epsilon)
        elif type == 'NuSVR':
            self.model = NuSVR(C=C, nu=nu, kernel='linear')
        elif type == 'Lasso':
            self.model = Lasso(alpha=alpha)
        
    @staticmethod
    def get_cal(m):
        # get qualitative (categorical) features
        # watch out as indices depend on feature vector!
        return np.hstack((m[:,:23], m[:,24:37], m[:,38:52])) + 1
    
    @staticmethod
    def get_cant(m):
        # get quantitative (numeric) features
        # watch out as indices depend on feature vector!
        return np.hstack((m[:,23:24], m[:,37:38], m[:,52:]))
        
    def fit(self, train_X, train_Y):
        # no fitting done here, just saving data
        if self.limit:
            if len(train_X) > self.limit:
                train_X = train_X[-self.limit:]
                train_Y = train_Y[-self.limit:]
        self.train_X = np.array(train_X)
        self.train_Y = np.array(train_Y)
        
        
    def predict(self, test_X):
        # fitting done here
        # not efficient on the long term
        test_X = np.array(test_X)
        enc = OneHotEncoder()
        scal = MinMaxScaler()
        data = np.vstack((self.train_X, test_X))
        enc.fit(self.get_cal(data))
        scal.fit(self.get_cant(data))
        
        new_train_X1 = enc.transform(self.get_cal(self.train_X))
        new_train_X2 = scal.transform(self.get_cant(self.train_X))
        new_train_X = scipy.sparse.hstack((new_train_X1, new_train_X2))
        new_test_X1 = enc.transform(self.get_cal(test_X))
        new_test_X2 = scal.transform(self.get_cant(test_X))
        new_test_X = scipy.sparse.hstack((new_test_X1, new_test_X2))
        
        self.model.fit(new_train_X, self.train_Y)
        R = self.model.predict(new_test_X)
        return R
Example #10
def test_lasso_regression():
	datafile_viper = '../data_viper/viper.pkl'
	viper = loadfile(datafile_viper)

	from sklearn.linear_model import Lasso

	model = Lasso(alpha=1e-3)
	model.fit(viper.train_feat, viper.train_y)

	y_pred = model.predict(viper.test_feat)
	print('testing error {}'.format(abs_error(y_pred, viper.test_y)))
def main(folds = 5):
    print "folds: ", folds
    #read in  data, parse into training and target sets
    print "\n ------------------Load file --------------- \n"
    train = np.loadtxt(sys.argv[1]).T
    min_max_scaler = preprocessing.MinMaxScaler()
    train = min_max_scaler.fit_transform(train)
    #test data set
    xtest = train[100:112, :]
    train = train[0:100, :]
    print "Size of read data: ", train.shape
    #train = imputation_missingValue(train)
    print "After Standardization:"
    print train
  
    target = np.loadtxt(sys.argv[2]).T
    ytest = target[100:112, :]
    target = target[0:100,:]
    print "Size of read data: ", target.shape

    al = 0.3
    rf = Lasso(alpha=al)
    #Simple K-Fold cross validation.
    cv = cross_validation.KFold(len(train), folds)
    #iterate through the training and test cross validation segments and
    #run the classifier on each one, aggregating the results into a list
    results = []
    i = 0
    min_MSE = sys.maxint
    best_train = -1
    best_test = -1
    for traincv, testcv in cv:
        start = timeit.default_timer()
        i += 1
        print i, "epoch"
        rf.fit(train[traincv], target[traincv])
        prediction = rf.predict(train[testcv])
        MSE = mean_squared_error(target[testcv], prediction)
        print "MSE: ", MSE, " for ",i
        if min_MSE > MSE:
            best_train = traincv
            best_test = testcv
            min_MSE = MSE
        results.append(MSE)
        stop = timeit.default_timer()
	print "Program running time: ", stop - start 
    #print out the mean of the cross-validated results
    print "Results: " + str( np.array(results).mean() ), "for folds: ", folds
    print "Results for independent data: ", mean_squared_error(rf.fit(train[best_train], target[best_train]).predict(xtest), ytest)
    print "R squared:"
    print "alpha:", al
def fit_predict_model(l1_penalty):
    RSS = np.zeros((len(l1_penalty)))
    num_nonzero_coeff = np.zeros((len(l1_penalty)))
    idx = 0
    for l1_penalty_choice in l1_penalty:
        model = Lasso(alpha=l1_penalty_choice, normalize=True)
        model.fit(training[all_features], training['price'])
        predicted_price = model.predict(validation[all_features])
        RSS[idx] = np.sum((predicted_price - validation['price'])**2)
        num_nonzero_coeff[idx] = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
        idx += 1
    return (RSS, num_nonzero_coeff, model)  # NB: this is the model for the last alpha tried, not the best one
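
# Usage sketch (hypothetical penalty grid; assumes the `training`/`validation`
# frames above): pick the alpha that minimizes validation RSS, since
# fit_predict_model only returns the model fitted with the last alpha.
l1_penalty = np.logspace(1, 7, num=13)
RSS, num_nonzero_coeff, _ = fit_predict_model(l1_penalty)
best_l1_penalty = l1_penalty[np.argmin(RSS)]
print(best_l1_penalty)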
Example #13
def lasso_regression(alpha):
    #Fit the model
    lassoreg = Lasso(alpha=alpha,normalize=True, max_iter=1e5)
    lassoreg.fit(A_x, A_y)
    y_pred = lassoreg.predict(A_x)
    
    #Return the result in pre-defined format
    rss = sum((y_pred-A_y)**2)
    ret = [rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret
def lasso_regression(data, predictors, alpha):
    #Fit the model
    lassoreg = Lasso(alpha=alpha,normalize=True, max_iter=1e5)
    lassoreg.fit(data[predictors],data['TransformedLife'])
    y_pred = lassoreg.predict(data[predictors])
    
    #Return the result in pre-defined format
    rss = sum((y_pred-data['TransformedLife'])**2)
    ret = [rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret
Example #15
def linearReg():
    sl=Lasso(alpha=0.2)

    sl.fit(features_array,values_array)

    predict_val=sl.predict(features_array)

    print(sl.coef_)
    print(sl.score(features_array,values_array))

    fig = plt.figure()
    ax = plt.subplot(111)
    ax.bar(range(0,features.shape[1]),sl.coef_)
    plt.show()
Example #16
def Lasso_model(train_linear, test_linear):
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    real_train_tar=np.expm1(train_linear_tar)
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    real_train_tar=np.expm1(train_linear_tar)
    """
        . Lasso model
    """
    
    lassocv = LassoCV(alphas=np.logspace(-5, 4, 400))
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    write_pkl(lassocv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/lasso_params.pkl')
    return test_prediction_lasso
Example #17
def comparaison_moindres_carres(X,Y):
    # NB: random.seed() returns None, so these calls amount to random_state=None
    X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=random.seed())
    clf_lasso = Lasso(selection='random', random_state=random.seed())
    clf_ridge = Ridge()
    clf_reg_lin = LinearRegression(n_jobs=-1)
    clf_lasso.fit(X_train,Y_train)
    clf_ridge.fit(X_train,Y_train)
    clf_reg_lin.fit(X_train,Y_train)
    Y_lasso=clf_lasso.predict(X_test)
    Y_ridge=clf_ridge.predict(X_test)
    Y_reg_lin=clf_reg_lin.predict(X_test)
    err_lasso=mean_squared_error(Y_test,Y_lasso)
    err_ridge=mean_squared_error(Y_test,Y_ridge)
    err_reg_lin=mean_squared_error(Y_test,Y_reg_lin)
    print("Erreur de Lasso={:1.2f}\nErreur de Ridge={:1.2f}\nErreur de regression lineaire={:1.2f}\n".format(err_lasso,err_ridge,err_reg_lin))
def pred_sand(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    sand_lassoed_vars = lass_varselect(train, all_vars, 'Sand', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1200)
    univ_selector.fit(train[all_vars], train['Sand'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if sand_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if sand_lassoed_vars[x]:
            lass_only.append(all_vars[x]) 

    # randomforest
    forst = RandomForestRegressor(n_estimators=100)
    forst.fit(train.loc[:, chosen], train['Sand'])
    for dset in data:
        dset['sand_for_prds'] = forst.predict(dset.loc[:, chosen])
        
    # SVM
    svr = svm.SVR(C=14, epsilon=.43, kernel='linear')
    svr.fit(train.loc[:, lass_only], train['Sand'])
    for dset in data:
        dset['sand_svr_prds'] = svr.predict(dset.loc[:, lass_only])
        
    # lasso
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_las_prds'] = lass.predict(dset[all_vars])

    # ridge
    sand_ridge = RidgeCV(np.array([.7]), normalize=True)
    sand_ridge.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_rdg_prds'] = sand_ridge.predict(dset[all_vars])
    # combination ('sand_for_prds' appears twice, doubling the forest's weight)
    models = ['sand_las_prds', 'sand_rdg_prds',
              'sand_for_prds', 'sand_for_prds', 'sand_svr_prds']
    #print train.loc[0:20, models]
    name = 'sand_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Sand')
def pred_Ca(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1400)
    univ_selector.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):  # was range(1, ...), which skipped the first variable
        if Ca_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['Ca'])
    #print forst.feature_importances_
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.loc[:, chosen])
        
    # lasso
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    Ca_ridge = RidgeCV(np.array([.5]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # combination ('Ca_for_prds' appears twice, doubling the forest's weight)
    models = ['Ca_las_prds', 'Ca_rdg_prds',
              'Ca_for_prds', 'Ca_for_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
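
# `write_preds`, shared by pred_pH, pred_sand and pred_Ca above, is not shown
# in this listing. A plausible reconstruction (an assumption): average the
# component prediction columns into one combined column per data set. Under a
# plain mean, a column listed twice (as the forest columns are) gets double weight.
def write_preds(models, name, train, val, test, target):
    for dset in (train, val, test):
        dset[name] = dset[models].mean(axis=1)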
Example #20
def lasso_regression(data,target,alphas):
    plt.figure()
    mean_rmses=[]
    kf=KFold(len(target),10,True,None)  # pre-0.18 cross_validation.KFold(n, n_folds, shuffle, random_state)
    for alpha0 in alphas:
        rmses=[]
        clf=Lasso(alpha=alpha0,normalize=True)
        for train_index, test_index in kf:
            data_train,data_test=data[train_index],data[test_index]
            target_train,target_test=target[train_index],target[test_index]
            clf.fit(data_train,target_train)
#            print(clf.sparse_coef_)
            rmse=sqrt(np.mean((clf.predict(data_test)-target_test)**2))
            rmses.append(rmse)
        mean_rmses.append(np.mean(rmses))
        x0=np.arange(1,11)
        plt.plot(x0,rmses,label='alpha='+str(alpha0),marker='o')
        
    lr = linear_model.LinearRegression(normalize = True)
    rmses = []
    for train_index, test_index in kf:
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = target[train_index], target[test_index]
        lr.fit(data_train, target_train)
        rmse = sqrt(np.mean((lr.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
    mean_rmses.append(np.mean(rmses))
    x0=np.arange(1,11)
    plt.plot(x0,rmses,label='linear',marker='*')
    
    plt.title("RMSE comparison between different alpha values of Lasso regularization")
    plt.xlabel("cross validation indices")
    plt.ylabel("RMSE")
    plt.legend()
    plt.show()
        
    return mean_rmses
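
# The function above relies on the pre-0.18 sklearn.cross_validation.KFold API,
# where KFold(n, n_folds, shuffle, random_state) is itself iterable. A sketch of
# the modern equivalent:
from sklearn.model_selection import KFold as ModernKFold
kf_modern = ModernKFold(n_splits=10, shuffle=True)
# for train_index, test_index in kf_modern.split(data): ...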
        alphas = [1e-10, 1e-7, 1e-5, 1e-3, 1e-1, 1]
        xtrain, xtest, ytrain, ytest = train_test_split(resultx,
                                                        resulty,
                                                        train_size=0.9,
                                                        random_state=22)
        xtrain = np.reshape(xtrain, [-1, len(usedcolumns)])
        xtest = np.reshape(xtest, [-1, len(usedcolumns)])
        general_error = []
        for i, alpha in enumerate(alphas):
            mses = []
            #do cross validation to find the best alpha
            for trains, valids in KFold(4, shuffle=True).split(
                    range(xtrain.shape[0])):
                lreg = Lasso(alpha=alpha, normalize=True)
                lreg.fit(xtrain[trains], ytrain[trains])
                y_pred = lreg.predict(xtrain[valids])
                mses.append(mse(y_pred, ytrain[valids]))
            general_error.append(np.mean(mses))
        #use the entire training dataset to fit the Lasso model with the best alpha
        indexs2 = np.argmin(general_error)
        best_alpha = alphas[int(indexs2)]
        lreg2 = Lasso(alpha=best_alpha, normalize=True)
        lreg2.fit(xtrain, ytrain)
        y_pred2 = lreg2.predict(xtrain)

        #record these data
        mseg.append(mse(y_pred2, ytrain))
        R2.append(r2_score(ytrain, y_pred2))  # r2_score expects (y_true, y_pred)
        models.append(lreg2)
        test_set.append([xtest, ytest])
        bests_alpha.append(best_alpha)
Example #22
def run_stack(SEED):



    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../preprocessdata/pre_departition_train.csv')
    trainBaseWeight = trainBase['var11']
    #test = pd.read_csv('../data/pre_shuffled_test.csv')

    columns = trainBase.columns.values.tolist()
    columnsHighScore = trainBase.columns.values.tolist()


    print(trainBase.columns)
    
    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    #test = np.nan_to_num(np.array(test))
    
    gc.collect()   
   
    
    avg = 0
    avgLast = -1
    NumFolds = 5 


    clf = Lasso(alpha=0.00010) # found with tune_lasso.py

    
    
    
    print ("Data size: " + str(len(trainBase)))
    print("Begin Training")
    
    lenTrainBase = len(trainBase)
   

    gc.collect()
    
    
    featuresRemaining = []
    avgScore = []    
    
    
    while True:
        print(clf)
        avg = 0
    
        coef_dataset = np.zeros((len(columns),NumFolds))
   
        foldCount = 0

        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
            
        for train_index, test_index in Folds:
    
            print()
            print ("Iteration: " + str(foldCount))
            
            
            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))    
    
    
            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]
            
            targetTest = [targetBase[i] for i in test_index]    
            trainTest = [trainBase[i] for i in test_index]    
            weightTest = [trainBaseWeight[i] for i in test_index]
            

            #print "LEN: ", len(train), len(target)
            
            
            target = np.array(np.reshape(target, (-1, 1)) )           
            #train = np.array(np.reshape(train, (-1, 1))  ) 
            weight = np.array(np.reshape(weight, (-1, 1)))              
    
            targetTest = np.array(np.reshape(targetTest, (-1, 1)) )  
            #trainTest = np.array(np.reshape(trainTest, (-1, 1)) )  
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))              
            

            #clf.fit(train, target, sample_weight = weight
            clf.fit(train, target)
            predicted = clf.predict(trainTest) 
 
  
            print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())))
            avg += score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())/NumFolds

                 
            coef_dataset[:, foldCount] = clf.coef_                 

            foldCount = foldCount + 1
        

     
        
        coefs = coef_dataset.mean(1)        
        sorted_coefs = sorted(map(abs, coefs)) # must start by removing coefficients closest to zero.
        print(coefs)
        print("len coefs: " + str(len(sorted_coefs)))
        if len(sorted_coefs) < 6:
            break  # sorted_coefs[5] below needs at least 6 coefficients

        threshold = sorted_coefs[5]  # the 6th-smallest |coef|; everything at or below it gets dropped

        print(str(len(columns)))
        print(trainBase.shape)
        
        toDrop = []        
        
        # never drop the var11 (weight) and id columns
        for index in range(len(coefs) - 1, -1, -1): # must reverse columns all shift to lower numbers.
            if  abs(coefs[index]) <= threshold and columns[index] != "var11" and columns[index] != "id":# abs(), remove closest to zero.
                print("Drop: " + str(index) + " " + columns[index] + " " + str(coefs[index]))
                #trainBase = np.delete(trainBase,[index], axis=1)
                toDrop.append(index)
               
               
                #print(columns)
                if columns[index] in columns: 
                    columns.remove(columns[index])  
                #print(columns)
        
        print("start drop")
        trainBase = np.delete(trainBase,toDrop, axis=1)      
        print("End drop")        
        
        
        if avg > avgLast:
            print("Saving Copy " + str(avgLast) + " " + str(avg))
            avgLast = avg
            columnsHighScore = columns.copy()

        print("Threshold: " + str(threshold))        
        print ("------------------------Average: " + str(avg))
        print(columnsHighScore)
        print(str(len(columns)))
        print(trainBase.shape)
           
           
        featuresRemaining.append(len(columns))           
        avgScore.append(avg)
           
        #break
    
    
               
    gc.collect()    
    trainBase = pd.read_csv('../preprocessdata/pre_departition_train.csv')
    trainBase = trainBase.loc[:,columnsHighScore]
    trainBase.to_csv("../models/" + str(clf)[:5] +  "_train.csv", index = False)
    
    
    gc.collect()
    test = pd.read_csv('../preprocessdata/pre_departition_test.csv')
    test = test.loc[:,columnsHighScore]
    test.to_csv("../models/" + str(clf)[:5] + "_test.csv", index = False)  
      
      
    print(columnsHighScore)      
    print(featuresRemaining)
    print(avgScore)
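
# `score.normalized_weighted_gini` lives outside this listing. A commonly used
# implementation of this Kaggle metric, as a sketch (not necessarily the exact
# module imported here):
import pandas as pd

def weighted_gini(act, pred, weight):
    df = pd.DataFrame({"act": act, "pred": pred, "weight": weight})
    df = df.sort_values("pred", ascending=False)
    rand = (df.weight / df.weight.sum()).cumsum().values
    total_pos = (df.act * df.weight).sum()
    lorentz = ((df.act * df.weight).cumsum() / total_pos).values
    # shoelace-style area between the Lorentz curve and the random baseline
    return (lorentz[1:] * rand[:-1]).sum() - (lorentz[:-1] * rand[1:]).sum()

def normalized_weighted_gini(act, pred, weight):
    return weighted_gini(act, pred, weight) / weighted_gini(act, act, weight)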
Example #23
prediction_lgb = np.exp(gbm.predict(test_feature))


#%%
# Prediction with RandomForestRegressor
forest = RandomForestRegressor().fit(X_train, y_train)
prediction_rf = np.exp(forest.predict(test_feature))

acc_forest = forest.score(X_train, y_train)
acc_dic.update(model_forest = round(acc_forest,3))
print(f"training dataに対しての精度: {forest.score(X_train, y_train):.2}")

#%%
# Prediction with Lasso regression
lasso = Lasso().fit(X_train, y_train)
prediction_lasso = np.exp(lasso.predict(test_feature))

acc_lasso = lasso.score(X_train, y_train)
acc_dic.update(model_lasso = round(acc_lasso,3))
print(f"training dataに対しての精度: {lasso.score(X_train, y_train):.2}")

#%%
# Prediction with ElasticNet
En = ElasticNet().fit(X_train, y_train)
prediction_en = np.exp(En.predict(test_feature))
print(f"training dataに対しての精度: {En.score(X_train, y_train):.2}")

acc_ElasticNet = En.score(X_train, y_train)
acc_dic.update(model_ElasticNet = round(acc_ElasticNet,3))

#%%
Example #24
    coefs2.append(lasso.coef_)

ax2 = plt.gca()
ax2.plot(alphas*2, coefs)
ax2.set_xscale('log')
ax2.set_title('Lasso')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights')

##Lasso regression with cross-validation##

#LassoCV with 10-fold cross-validation(similar to ISLR)#
lcv = LassoCV(alphas=None, max_iter=100000, normalize=True, cv=kfcv, n_jobs=2)
lcv.fit(X_train, Y_train)

print('\nBest LassoCV alpha value:')
print(lcv.alpha_)

#Lasso regression using best alpha#
lbest = Lasso(alpha=lcv.alpha_, normalize=True)
lbest.fit(X_train, Y_train)

print('\nBest Lasso MSE:')
print(mean_squared_error(Y_test, lbest.predict(X_test)))

print('\nLasso Coefficients:')
print(pd.Series(lbest.coef_, index=xcols))

plt.show()
Example #25
def Lasso_Reg(alpha) :
    L1 = Lasso(alpha=alpha, normalize=True)
    L1.fit(X3_train, y3_train)
    pred = L1.predict(X3_test)
    return L1, pred
Example #26
#Let us predict the stock market for the next 20 days
days = 20

data_seed = df['Adj Close'].values[-window_size:][None]

input_values = {
    'Lasso': data_seed,
    'Ridge': data_seed,
    'BayesianRidge': data_seed,
    'ElasticNet': data_seed
}
values = {'Lasso': [], 'Ridge': [], 'BayesianRidge': [], 'ElasticNet': []}

for i in range(days):
    values['Lasso'].append(reg_1.predict(input_values['Lasso'])[0])
    values['Ridge'].append(reg_2.predict(input_values['Ridge'])[0])
    values['BayesianRidge'].append(
        reg_3.predict(input_values['BayesianRidge'])[0])
    values['ElasticNet'].append(reg_4.predict(input_values['ElasticNet'])[0])

    for v in input_values:
        val = input_values[v]
        val = np.insert(val, -1, values[v][-1], axis=1)
        val = np.delete(val, 0, axis=1)
        input_values[v] = val.copy()
for v in input_values:
    values[v] = np.array(values[v])

# Plotting the Predictions of all the four Regressors in sub plots
last_date = datetime.strptime("{:%Y-%m-%d}".format(df.index[-1]), '%Y-%m-%d')
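
# For plotting, a date index covering the predicted horizon can be derived from
# last_date (a sketch assuming business-day spacing; df's real frequency is not
# shown in this listing):
import pandas as pd
future_dates = pd.date_range(last_date, periods=days + 1, freq="B")[1:]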
Example #27
lassocv = LassoCV(alphas=None,
                  cv=10,
                  max_iter=100000,
                  normalize=False,
                  random_state=1991,
                  positive=True)
lassocv.fit(regressors_train_pca, target_train)

# Fit Lasso model with best alpha
lasso = Lasso(max_iter=10000,
              normalize=False,
              alpha=lassocv.alpha_,
              positive=True)  # a = 17671.398612860448
lasso.fit(regressors_train_pca, target_train)
# Predict on test set
predicted_lasso = lasso.predict(regressors_test_pca)

# RMSE
math.sqrt(mean_squared_error(target_test, predicted_lasso))
# MAE
mean_absolute_error(predicted_lasso, target_test)
#MAPE
np.mean(np.abs((target_test - predicted_lasso) / target_test))

predicted_df_lasso = pd.DataFrame({
    'Predicted_Values':
    list(predicted_lasso.flatten().astype(int)),
    'Actual_Values':
    list(target_test)
}).set_index(
    target_test.index)
Example #28
cv_scores = cross_val_score(reg,X,y, cv=5)
# Print the 5-fold cross-validation scores
print(cv_scores)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores)))



#regularised linear regression

#we will use 2 types of linear regression, ridge and lasso

#lasso regression. Here we will try to predict the data from testdata.csv with lasso regression.
#We will also draw a graph to show how lasso regression selects features.

lasso = Lasso(alpha = 0.4, normalize = True)
lasso.fit(X,y)
y_pred_all = lasso.predict(X_dummy_all)
print('predicted value using all the features and using lasso regression is : '+str(y_pred_all))

#code to draw the graph
plt.clf()
lasso_coef = lasso.coef_

df_columns = np.array(['population', 'fertility', 'HIV', 'CO2', 'BMI_male', 'GDP','BMI_female', 'child_mortality'])

plt.plot(range(len(df_columns)), lasso_coef)
plt.xticks(range(len(df_columns)), df_columns, rotation=60)
plt.margins(0.02)
plt.savefig('lassofig.png')


#ridge regression
Example #29
        color="chocolate")
plt.show()
print()

#######
#Ridge#
#######
print('##################')
print('#RIDGE Regression#')
print('##################')
ridge = Ridge().fit(X_train, y_train)

y_predicted_ridge = ridge.predict(X_test)
y_predicted_binary_ridge = [1 if yp >= 0.5 else 0 for yp in y_predicted_ridge]
print('The accuracy score of Ridge Regression is: {:.3f}'.format(
    accuracy_score(y_test_binary, y_predicted_binary_ridge)))
print()

#######
#Lasso#
#######
print('##################')
print('#LASSO Regression#')
print('##################')
lasso = Lasso().fit(X_train, y_train.value.apply(getBinary))

y_predicted_lasso = lasso.predict(X_test)
y_predicted_binary_lasso = [1 if yp >= 0.5 else 0 for yp in y_predicted_lasso]
print('The accuracy score of Lasso Regression is: {:.3f}'.format(
    accuracy_score(y_test_binary, y_predicted_binary_lasso)))
Example #30
# Run prediction on the Kaggle test set.
y_pred_xgb = regr.predict(test_df_munged)

################################################################################

from sklearn.linear_model import Lasso

# I found this best alpha through cross-validation.
best_alpha = 0.00099

regr = Lasso(alpha=best_alpha, max_iter=50000)
regr.fit(train_df_munged, label_df)

# Run prediction on training set to get a rough idea of how well it does.
y_pred = regr.predict(train_df_munged)
y_test = label_df
print("Lasso score on training set: ", rmse(y_test, y_pred))

# Run prediction on the Kaggle test set.
y_pred_lasso = regr.predict(test_df_munged)

################################################################################

# Blend the results of the two regressors and save the prediction to a CSV file.

y_pred = (y_pred_xgb + y_pred_lasso) / 2
y_pred = np.exp(y_pred)

pred_df = pd.DataFrame(y_pred, index=test_df["Id"], columns=["SalePrice"])
pred_df.to_csv('output_XGBoost.csv', header=True, index_label='Id')
lasso.fit(boston.data, boston.target)
lasso_coef = lasso.coef_
lasso_coef

# In[39]:

plt.plot(range(13), lasso_coef)
plt.xticks(range(13), boston.feature_names)
plt.ylabel('coefficients')  # call the function; assigning to plt.ylabel would shadow it
plt.show()

# In[41]:

lasso = Lasso(alpha=0.1, normalize=True)
lasso.fit(x_train, y_train)
y_lasso = lasso.predict(x_test)
lasso_mse = mean_squared_error(y_test, y_lasso)
lasso_mse

# In[43]:

from sklearn.linear_model import Ridge

# In[44]:

ridge = Ridge(alpha=0.1, normalize=True)
ridge.fit(x_train, y_train)
y_ridge = ridge.predict(x_test)
ridge_mse = mean_squared_error(y_test, y_ridge)
ridge_mse
Example #32
# learn scalers. Squishes feature data set into the same scale. Without this the RMSE doubles!!
feature_scaler = RobustScaler(quantile_range=(25, 75)).fit(feature_train)

# perform scaling
feature_train = feature_scaler.transform(feature_train)
feature_test = feature_scaler.transform(feature_test)

#Run Lasso model
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, classification_report, mean_squared_error

alpha = 0.01
lasso = Lasso(alpha=alpha)
lasso.fit(feature_train, target_train)
pred_train_lasso = lasso.predict(feature_train)

#here we're asking the model to predict the results of using the test data set. i.e. if we passed it next years survey numbers, it would predict the happiness score
# We'd need to pass in the features in the same order as we define above: print(lasso.predict([ [10, 10 ] ]))
pred_test_lasso = lasso.predict(feature_test)

#RMSE. Results are around 0.25, with the dependent variable (happiness) ranging from 2.8-7.5. Therefore the RMSE is small (good!)
# print("RMSE train set: ", np.sqrt(mean_squared_error(target_train,pred_train_lasso)))
# print("RMSE test set: ", np.sqrt(mean_squared_error(target_test,pred_test_lasso)))
# #R-squared values are about 0.95; 1 would mean the model explains the variability in happiness perfectly. So R-squared is high
# print("R-squared train set: ", r2_score(target_train, pred_train_lasso))
# print("R-squared test set: ", r2_score(target_test, pred_test_lasso))

train_score = lasso.score(feature_train, target_train)
test_score = lasso.score(feature_test, target_test)
Example #33
        lassoLams = np.logspace(-5,6,23)

        def Learn(lam):
            lasso = Lasso(alpha=lam,fit_intercept=False,copy_X=True,max_iter=5000)  # max_iter must be an int in recent sklearn
            lasso.fit(X_train,y_train)
            return lasso

        
        optLam = ExperimentUtils.gridSearch1D(lassoLams, Learn, Eval, MAX=False,verbose=False)

        
        lasso = Lasso(alpha=optLam,fit_intercept=False,copy_X=True,max_iter=250000)
        lasso.fit(X,y)

        lasso_yhat = np.array([lasso.predict(X_test)]).T
        lasso_mse = sum((y_test - lasso_yhat) ** 2)
        lasso_beta = np.array([lasso.coef_]).T
        lasso_betaLoss = max(abs(lasso_beta - oracleBeta))[0]
        with open(mseFile('LASSO'),'a') as f:
            f.write("%15.10f    " % lasso_mse)
        with open(betaFile('LASSO'),'a') as f:
            f.write("%15.10f    " % lasso_betaLoss)
        with open(lamFile('LASSO'),'a') as f:
            f.write("%15.10f    " % optLam)
        print "LASSO MSE: %f   BETA_LOSS: %f   OPT_LAM: %f" % (lasso_mse, lasso_betaLoss, optLam)


        ############
        ## Oracle ##
        ############
Example #34
#tune the lambda parameter by applying k-fold cross validation
kf = KFold(N, n_folds=5)  #produce the k folds

Lambda = np.arange(0.001, 1.0, 0.001)  #a list of lambdas
Prediction_error = []  #an empty list to hold the prediction error

for l in Lambda:  #loop over lambdas
    pe = 0.0  #initialize prediction error
    for train_index, test_index in kf:  #loop over the folds
        X_train, X_test = X[train_index], X[test_index]  #training and test independent variables
        y_train, y_test = y[train_index], y[test_index]  #training and test dependent variables

        model = Lasso(l)  #create the model object
        results = model.fit(X_train, y_train)  #fit the model
        pe += sum((model.predict(X_test) - y_test)**2)  #predict the test data, compute the error, and add to the total
    Prediction_error.append(pe)  #append the prediction error

#run the lasso:
#Lambda = sum(((1.0/np.array(Prediction_error))/sum(1.0/np.array(Prediction_error)))*np.array(Lambda))    #compute lambda as the weighted average
model = Lasso(Lambda[Prediction_error.index(min(Prediction_error))])  #model with the lambda that minimizes prediction error
results = model.fit(X, y)  #fit the model
for i, j in zip(results.coef_, data[2]):  #loop over results
    print('Lasso:', round(i, 4), '    True', round(j, 4))  #print and compare with the truth
Example #35
feature_selector = Lasso(alpha=alpha, fit_intercept=True, max_iter=1, \
warm_start=warm_start, positive=False, tol=0.0)

while it_since_min < delta_it:
    i += 1

    print 'fitting'
    sys.stdout.flush()

    feature_selector.fit(data_train, salaries_train)

    print 'predicting'
    sys.stdout.flush()

    salaries_pred = feature_selector.predict(data_valid)/scale_fac
    error = np.average(np.abs(salaries_valid/scale_fac - salaries_pred))
    average_salary = np.average(salaries/scale_fac)
    coeff = feature_selector.coef_
    intercept = feature_selector.intercept_

    it_array = np.append(it_array, [i])
    valid_array = np.append(valid_array, [error])

    plt.clf()
    plt.plot(it_array, valid_array)
    plt.xlabel('iteration count')
    plt.ylabel('mean validation error')
    plt.title('Lasso (linear) Model Selection alpha = ' + str(alpha))
    plt.pause(0.001)
    fig.savefig('../../plots/lasoo_linear_model_' + str(alpha) + '_' + str(error) + '.pdf')
from sklearn.cross_validation import KFold
from sklearn.linear_model import Lasso
import numpy as np
import pylab as pl
from sklearn.datasets import load_boston
#Loading boston datasets 
boston = load_boston()
# Adding a column of 1s for x0 (Regression Design Matrix)
x = np.array([np.concatenate((v,[1])) for v in boston.data])
y = boston.target
# Create linear regression object with a lasso coefficient 0.5
lasso = Lasso(fit_intercept=True, alpha=0.5)
# Train the model using the training set
lasso.fit(x,y)
# predictions p = np.array([lasso.predict(xi) for xi in x])
p = lasso.predict(x)
#plotting real vs predicted data
pl.plot(p, y,'ro')
pl.xlabel('predicted')
pl.title('Lasso Regression, alpha=0.5')
pl.ylabel('real')
pl.grid(True)
pl.show()
#vector of errors
err = p-y
# Dot product of error vector is sum of squared errors
total_error = np.dot(err,err)
#RMSE on training data
rmse_train = np.sqrt(total_error/len(p))
# Compute RMSE using 10-fold x-validation
kf = KFold(len(x), n_folds=10)
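
# The listing stops right after building `kf`; the cross-validated RMSE promised
# by the comment would follow the same pattern as the training RMSE above (a
# sketch using the same pre-0.18 KFold API):
xval_err = 0.0
for train_idx, test_idx in kf:
    lasso.fit(x[train_idx], y[train_idx])
    fold_err = lasso.predict(x[test_idx]) - y[test_idx]
    xval_err += np.dot(fold_err, fold_err)
rmse_10cv = np.sqrt(xval_err / len(x))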
Example #37
class MetaModel_Lasso(customRegressor):
    def __init__(self, in_df, models, sub_params, n_folds, qualPow, imputeDict):
        super(MetaModel_Lasso,self).__init__()
        self.qualPow = qualPow
        self.imputeDict = imputeDict
        # self.features = features
        self.models = models
        self.subparams = sub_params
        self.meta = None
        # self.model = self.meta # aliases shallow copied for use with base class
        self.n_folds = n_folds
        self.predBool = False

        from meta_features import impute_shell
        self._imputeVals = impute_shell(qualPow)
        tempDF = self._imputeVals(in_df)
        self.X = tempDF.drop(columns=["SalePrice"]).copy()
        self.y = np.log(tempDF.SalePrice).values.reshape(-1,1)
        self.pipeline_X = self._make_pipe()
        self.pipeline_y = RobustScaler()

    def _make_pipe(self):
        import meta_features as f
        nonePipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value="None"), OneHotEncoder(drop="first"))
        zeroPipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value=0), OneHotEncoder(drop="first", categories="auto"))
        scalePipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value=0), PowerTransformer())

        regressionPipeline = ColumnTransformer([
            ("setNone", nonePipeline, f.fillNone),
            # ("setZero", zeroPipeline, f.fillZeroCat),
            ("transformed", scalePipeline, f.fillZeroCont),
            # ("dictImputed", make_pipeline(self.dictImputer(f.imputeDict),
            #                               OneHotEncoder(drop="first")), list(f.imputeDict.keys())),
            # ("bool", "passthrough", f.imputeBool),
            ("categoricalInts", "passthrough", f.cat_to_int),
            # ("dropped", "drop", f.dropList)
        ], remainder="drop")
        return make_pipeline(regressionPipeline, RobustScaler())


    def genPreds(self,X,y):
        self.predBool = True
        self.model_list = [list() for i in self.models]
        folds = KFold(n_splits = self.n_folds, shuffle=True, random_state=6)

        oob_preds = np.zeros((X.shape[0], len(self.models)))

        for i, model in enumerate(self.models):
            for trainIdx, outIdx in folds.split(X):
                local_model = deepcopy(model)
                self.model_list[i].append(local_model)
                local_model.subset(trainIdx)
                local_model.fitModel(self.subparams[i])
                preds = local_model.predict(X.iloc[outIdx,:])
                oob_preds[outIdx,i] = preds.reshape(-1,)

        self.oob_preds = oob_preds

        # self.meta.fitModel(X, oob_preds, y)

    def fitModel(self,params):
        self._params = params
        self.meta = Lasso(**params)

        if not self.predBool:
            self.genPreds(self.X,self.y)
        
        piped_X = self.pipeline_X.fit_transform(self.X)
        meta_X = np.column_stack([piped_X,self.oob_preds])
        piped_y = self.pipeline_y.fit_transform(self.y)

        self.meta.fit(meta_X,piped_y)



    def getTrainRsquared(self):
        piped_X = self.pipeline_X.transform(self.X)
        meta_X = np.column_stack([piped_X, self.oob_preds])
        piped_y = self.pipeline_y.transform(self.y)
        return self.meta.score(meta_X,piped_y)
        

    def predict(self,X):
        piped_X = self.pipeline_X.transform(self._imputeVals(X))
        pred_Data = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.model_list
        ])
        meta_X = np.column_stack([piped_X,pred_Data])
        preds = self.meta.predict(meta_X)
        return self._invert(preds)
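
# MetaModel_Lasso assumes each entry of `models` exposes subset(), fitModel()
# and predict(); those wrappers are not part of this listing. A minimal
# hypothetical wrapper satisfying that interface:
class BaseModelWrapper:
    def __init__(self, estimator, X, y):
        self.estimator, self.X, self.y = estimator, X, y
        self.idx = None

    def subset(self, idx):
        self.idx = idx  # restrict training to these row positions

    def fitModel(self, params):
        self.estimator.set_params(**params)
        self.estimator.fit(self.X.iloc[self.idx, :], self.y[self.idx])

    def predict(self, X):
        return self.estimator.predict(X)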
Example #38

merged_test_data["CompetitionOpenSinceYear"] = merged_test_data["CompetitionOpenSinceYear"].fillna(merged_test_data["CompetitionOpenSinceYear"].median())

merged_test_data["StoreType"] = merged_test_data["StoreType"].fillna(merged_test_data["StoreType"].mode())
merged_test_data["Assortment"] = merged_test_data["Assortment"].fillna(merged_test_data["Assortment"].mode())

merged_test_data.loc[merged_test_data["StoreType"] == "a", "StoreType"] = 0
merged_test_data.loc[merged_test_data["StoreType"] == "b", "StoreType"] = 1
merged_test_data.loc[merged_test_data["StoreType"] == "c", "StoreType"] = 2
merged_test_data.loc[merged_test_data["StoreType"] == "d", "StoreType"] = 3
merged_test_data.loc[merged_test_data["Assortment"] == "a", "Assortment"] = 0
merged_test_data.loc[merged_test_data["Assortment"] == "b", "Assortment"] = 1
merged_test_data.loc[merged_test_data["Assortment"] == "c", "Assortment"] = 2
merged_test_data.loc[merged_test_data["Assortment"] == "d", "Assortment"] = 3
merged_test_data = merged_test_data.fillna(0)

las = Lasso()
predictors = ['DayOfWeek', 'Date', 'Promo', 'Promo2', 'Promo2SinceYear', 'Assortment', 'StoreType', 'CompetitionDistance']
las.fit(dataset[predictors], dataset["Sales"])
merged_test_data = merged_test_data[merged_test_data.Id != 0]
predictions = las.predict(merged_test_data[predictors])
submission = pd.DataFrame({
        "Id": merged_test_data["Id"].astype(int),
        "Sales": predictions
    })
submission = submission[submission.Id != 0]
submission.to_csv("kaggle.csv", index=False)
#scores = cross_validation.cross_val_score(las, dataset[predictors], dataset["Sales"], cv=3)
#print(scores.mean())
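
# The commented-out lines above use the removed sklearn.cross_validation module;
# the modern equivalent (sklearn >= 0.18) is:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(las, dataset[predictors], dataset["Sales"], cv=3)
print(scores.mean())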
Example #39
    # print 'best rmse for reduced problem: {}. alpha = {}'.format(best_reduced.rmse,best_reduced.alpha)

# End Cross validation

else:
    print 'Making test prediction'

    reg = Lasso(0.28)
    reg.fit(x_source_tf, y_source)
    """
    Predict output with chosen features and learned coefficients beta
    """
    # load test data and transform samples
    data_test = data_loader.restore_from_file('test.csv')
    n_samples_test = data_test.shape[0]
    ids_test = data_test[:, 0].reshape(n_samples_test, 1)
    x_test = data_test[:, 1:].reshape(n_samples_test, n_dimensions_x)
    x_test_tf = feature_transform(feature_vec, x_test)

    # predict output
    if transform:
        y_test = reg.predict(x_test_tf).reshape(n_samples_test, 1)

    else:
        y_test = reg.predict(x_test).reshape(n_samples_test, 1)

    # save output
    header = np.array(['Id', 'y']).reshape(1, 2)
    dump_data = np.hstack((ids_test, y_test))
    data_loader.save_to_file('results.csv', dump_data, header)
Example #40
#Mean Squared Error, R^2
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(
    y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' %
      (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

#3. LASSO regression model
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values  # target = last column; df[df.columns] would select every column
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.8,
                                                    random_state=42)
lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
#residual plot
plt.scatter(y_train_pred,
            y_train_pred - y_train,
            c='blue',
            marker='o',
            edgecolor='white',
            label='Training data')
plt.scatter(y_test_pred,
            y_test_pred - y_test,
            c='green',
            marker='s',
            edgecolor='white',
            label='Test data')
plt.title('LASSO regression residual errors')
def run_lasso(X_train, y_train, X_test):
    model = Lasso(alpha=1)
    result = model.fit(X_train,y_train)
    y_predicted = model.predict(X_test)
    return y_predicted
def preprocessing():
    onehotencoder = OneHotEncoder(handle_unknown="ignore")
    categorical_encoded_data = onehotencoder.fit_transform(
        features[CATEGORICAL_FEATURES].values).toarray()

    scaler = StandardScaler()
    scaled_numerical_data = scaler.fit_transform(
        features.drop(CATEGORICAL_FEATURES, axis=1))

    processed_data = np.concatenate(
        (categorical_encoded_data, scaled_numerical_data), axis=1)
    return processed_data  # without a return, the processed array is discarded


data = pd.read_csv('airbnb_data.csv')

features = data.drop(
    ['price', 'listing_url', 'image_url', 'title', 'district'], axis=1)
target = data['price']

features['rating'].fillna(features['rating'].mean(), inplace=True)
features['reviews'].fillna(1, inplace=True)
features['baths'].fillna('1 bath', inplace=True)

processed_data = preprocessing()
X_train, X_test, y_train, y_test = train_test_split(processed_data,
                                                    target,
                                                    test_size=0.3)
reg = Lasso()
reg.fit(X_train, y_train)
pred = reg.predict(X_test)
print('Regression score of the model', reg.score(X_test, y_test))
print('Mean absolute error for the model', mean_absolute_error(y_test, pred))
Example #43
r2=r2_score(y_test,y_pred)
print('r^2 score=',r2)


# --------------
'''Prediction using Lasso
In this task let's predict the price of the house using a lasso regressor. Check if there is any improvement in the prediction.'''
from sklearn.linear_model import Lasso

# Code starts here
#Instantiate a lasso model 
lasso=Lasso()
#fit model on training data
lasso.fit(X_train,y_train)
#make predictions on test features
lasso_pred=lasso.predict(X_test)
print('lasso test features predictions-',lasso_pred)
#Find the r^2 score
r2_lasso=r2_score(y_test,lasso_pred)
print('r^2 score lasso=',r2_lasso)


# --------------
from sklearn.linear_model import Ridge

# Code starts here
ridge=Ridge()
ridge.fit(X_train,y_train)
ridge_pred=ridge.predict(X_test)
print(ridge_pred)
r2_ridge=r2_score(y_test,ridge_pred)
#Seeing the Coefficients
list(zip(x_train.columns,ridge.coef_))


#Lasso regression
alphas=np.linspace(0.0001,1,100)
mse_list=[]
for a in alphas:
    lasso = Lasso(fit_intercept=True,alpha=a,max_iter=10000)

    #Computing MSE using 10-fold cross validation (pre-0.18 KFold API)
    kf=KFold(len(x_train),n_folds=10)
    xval_err=0
    for train, test in kf:
        lasso.fit(x_train.loc[train],y_train[train])
        p=lasso.predict(x_train.loc[test])
        #error = p - y_train[test]; xval_err += np.dot(error,error)
        xval_err += mean_squared_error(y_train[test],p)

    mse_10cv=xval_err/10
    #Uncomment below to print mse values for individual alphas
    #print('{:.3f}\t {:.6f}\t '.format(a,mse_10cv))
    mse_list.append(mse_10cv)
best_alpha=alphas[np.argmin(mse_list)]
print('Alpha with min 10cv error is: ',best_alpha)
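
# With the alpha chosen above, the final model would be refit on the full
# training data before prediction (a sketch; names follow the block above):
lasso_best = Lasso(fit_intercept=True, alpha=best_alpha, max_iter=10000)
lasso_best.fit(x_train, y_train)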



#Prediction

from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)

y_ridge = ridge_reg.predict(X)
lin_mse_ridge = mean_squared_error(y, y_ridge)
lin_rmse_ridge = np.sqrt(lin_mse_ridge)
lin_rmse_ridge

from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)

y_lasso = lasso_reg.predict(X)
lin_mse_lasso = mean_squared_error(y, y_lasso)
lin_rmse_lasso = np.sqrt(lin_mse_lasso)
lin_rmse_lasso


from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X, y)

y_elastic_net = elastic_net.predict(X)
lin_mse_elastic_net = mean_squared_error(y, y_elastic_net)
lin_rmse_elastic_net = np.sqrt(lin_mse_elastic_net)
lin_rmse_elastic_net

Example #46
def mult_reg(p_x, p_y):
    """
    Function to fit several linear models

    Parameters
    ----------

    p_x: pd.DataFrame with regressors or predictor variables

    p_y: pd.DataFrame with variable to predict

    Returns
    -------
    r_models: dict with the fitted models

    """
    xtrain, xtest, ytrain, ytest = train_test_split(p_x,
                                                    p_y,
                                                    test_size=.8,
                                                    random_state=455)

    # fit linear regression
    linreg = LinearRegression(normalize=False, fit_intercept=False)
    linreg.fit(xtrain, ytrain)
    y_p_linear = linreg.predict(xtest)

    # Fit RIDGE regression
    ridgereg = Ridge(normalize=True)
    model = ridgereg.fit(xtrain, ytrain)
    y_p_ridge = model.predict(xtest)

    # Fit LASSO regression
    lassoreg = Lasso(normalize=True)
    lassoreg.fit(xtrain, ytrain)
    y_p_lasso = lassoreg.predict(xtest)

    # Fit ElasticNet regression
    enetreg = ElasticNet(normalize=True)
    enetreg.fit(xtrain, ytrain)
    y_p_enet = enetreg.predict(xtest)

    # RSS = residual sum of squares

    # Return the result of the model
    r_models = {
        "summary": {
            "linear rss": sum((y_p_linear - ytest)**2),
            "Ridge rss": sum((y_p_ridge - ytest)**2),
            "lasso rss": sum((y_p_lasso - ytest)**2),
            "elasticnet rss": sum((y_p_enet - ytest)**2)
        },
        "test": ytest,
        'linear': {
            'rss': sum((y_p_linear - ytest)**2),
            'predict': y_p_linear,
            'model': linreg,
            'intercept': linreg.intercept_,
            'coef': linreg.coef_
        },
        'ridge': {
            'rss': sum((y_p_ridge - ytest)**2),
            'predict': y_p_ridge,
            'model': ridgereg,
            'intercept': ridgereg.intercept_,
            'coef': ridgereg.coef_
        },
        'lasso': {
            'rss': sum((y_p_lasso - ytest)**2),
            'predict': y_p_lasso,
            'model': lassoreg,
            'intercept': lassoreg.intercept_,
            'coef': lassoreg.coef_
        },
        'elasticnet': {
            'rss': sum((y_p_enet - ytest)**2),
            'predict': y_p_enet,
            'model': enetreg,
            'intercept': enetreg.intercept_,
            'coef': enetreg.coef_
        }
    }

    return r_models
Example #47
    def compute_abundancies(self):
        def bin_numbers(mzs):
            return (mzs * 200).astype(np.int32)

        isotope_patterns = [self.mz_list[f][a] for f in self.sum_formulae for a in self.adducts]
        all_mzs = np.concatenate([pattern[0] for pattern in isotope_patterns])
        # append 'infinity' so that searchsorted always returns indices less than the length
        all_mz_int_indices = np.concatenate((np.unique(bin_numbers(all_mzs)), [np.iinfo(np.int32).max]))

        def sparse_matrix_from_spectra(data, assume_presence=False):
            intensity_list = []
            row_list = []
            len_list = []
            for j, (mzs, intensities) in enumerate(data):
                int_mzs = bin_numbers(mzs)
                idx = all_mz_int_indices.searchsorted(int_mzs)
                if not assume_presence:
                    known = np.where(all_mz_int_indices[idx] == int_mzs)[0]
                    intensities = intensities[known]
                    idx = idx[known]
                    length = len(known)
                else:
                    length = len(mzs)
                intensity_list.append(intensities)#/np.linalg.norm(intensities))
                row_list.append(idx)
                len_list.append(length)

            intensities = np.concatenate(intensity_list)
            rows = np.concatenate(row_list)
            columns = np.repeat(np.arange(len(data), dtype=np.int32), len_list)
            result = ssp.coo_matrix((intensities, (rows, columns)),
                                     shape=(len(all_mz_int_indices), len(data)),
                                     dtype=float)
            return result

        #print self.sum_formulae
        logging.info("computing Y matrix")
        Y = sparse_matrix_from_spectra([(s.mzs, s.intensities) for s in self.spectra])
        #print Y.nnz, Y.shape
        logging.info("computing D matrix")
        D = sparse_matrix_from_spectra(isotope_patterns, assume_presence=True)
        #print D.nnz, D.shape

        n_masses, n_molecules = D.shape
        n_spectra = Y.shape[1]

        np.set_printoptions(threshold='nan', linewidth=300, precision=3, suppress=True)  # newer numpy expects threshold=np.inf
        #print (D.todense() > 0).astype(int)

        neighbors_map = {}
        indices = -1 * np.ones((self.nrows, self.ncols), dtype=int)
        for s in self.spectra:
            x, y = s.coords[:2]
            indices[x, y] = s.index
        for x in xrange(self.nrows):
            for y in xrange(self.ncols):
                neighbors_map[indices[x, y]] = []
                #for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1), (-1, -1), (1, 1), (-1, 1), (1, -1)]:
                for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
                    if 0 <= x + dx < self.nrows and 0 <= y + dy < self.ncols:
                        idx = indices[x + dx, y + dy]
                        if idx == -1:
                            continue
                        neighbors_map[indices[x, y]].append(idx)

        n_pairs = sum(len(x) for x in neighbors_map.values()) / 2
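        # (each undirected neighbor pair appears twice in neighbors_map,
        # once from each endpoint, hence the division by 2)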

        def w_w0_update_matrix():
            xs = []
            ys = []
            data = []

            # upper part (corresponds to DW + W0)
            for i in xrange(n_spectra):
                y_offset = n_molecules * i
                x_offset = n_masses * i

                ys.append(D.col + y_offset)
                xs.append(D.row + x_offset)
                data.append(D.data)

            ys.append(np.repeat(np.arange(n_spectra) + n_molecules * n_spectra, n_masses))
            xs.append(np.arange(n_masses * n_spectra))
            data.append(np.ones(n_masses * n_spectra))

            # middle part (corresponds to W)
            x_offset = n_masses * n_spectra
            
            ys.append(np.arange(n_molecules * n_spectra))
            xs.append(np.arange(n_molecules * n_spectra) + x_offset)
            data.append(np.ones(n_molecules * n_spectra))

            # lower part (corresponds to the neighbor abundancy differences)
            x_offset = (n_masses + n_molecules) * n_spectra

            for i in neighbors_map:
                for j in neighbors_map[i]:
                    if i > j: continue
                    ys.append(np.arange(n_molecules) + n_molecules * i)
                    xs.append(np.arange(n_molecules) + x_offset)
                    data.append(np.ones(n_molecules))

                    ys.append(np.arange(n_molecules) + n_molecules * j)
                    xs.append(np.arange(n_molecules) + x_offset)
                    data.append(-1 * np.ones(n_molecules))
                    x_offset += n_molecules
            
            xs = np.concatenate(xs)
            ys = np.concatenate(ys)
            data = np.concatenate(data)
            
            result = ssp.coo_matrix((data, (xs, ys)), dtype=float)

            assert result.nnz == (D.nnz + n_masses + n_molecules) * n_spectra + n_molecules * n_pairs * 2
            assert result.shape[0] == (n_molecules + n_masses) * n_spectra + n_pairs * n_molecules
            assert result.shape[1] == n_spectra * (n_molecules + 1)

            return result.tocsc()
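
        # A is the design matrix of one stacked least-squares problem: the top
        # rows reproduce D*w + w0 per spectrum, the middle rows copy w itself,
        # and the bottom rows take differences of w between neighboring
        # spectra, so a single nonnegative Lasso fit updates w and w0 jointly.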

        A = w_w0_update_matrix()
        print A.shape, A.nnz

        nz = np.where(Y.sum(axis=0) > 0)[1]  # .A1
        xs = self.coords[nz, 0]
        ys = self.coords[nz, 1]
        # FIXME: there must be a simpler way!
        Y = Y.todense().A1.reshape((n_masses, n_spectra)).ravel(order='F')
        print "Y sum:", Y.sum()
        
        z0 = Y+1
        u0 = np.zeros(n_masses * n_spectra)

        z1 = np.zeros(n_molecules * n_spectra)
        u1 = np.zeros(n_molecules * n_spectra)

        z2 = np.zeros(n_pairs * n_molecules)
        u2 = np.zeros(n_pairs * n_molecules)

        from sklearn.linear_model import Lasso, ElasticNet, LinearRegression

        lambda_ = 1.0
        theta = 1e-20
        rho = 1.0

        print lambda_/rho/A.shape[0]

        w_w0_lasso = Lasso(alpha=lambda_/rho/A.shape[0], warm_start=True, fit_intercept=False, positive=True)
        z1_lasso = Lasso(alpha=lambda_/rho/z1.shape[0], fit_intercept=False, warm_start=True, positive=False)
        z2_ridge = ElasticNet(alpha=2*theta/rho/z2.shape[0], l1_ratio=0, warm_start=True, positive=False, fit_intercept=False)
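        # the three prox solvers reuse sklearn: w_w0_lasso solves the joint
        # (w, w0) least-squares step with an L1 penalty and nonnegativity;
        # fitting z1_lasso / z2_ridge against an identity design computes the
        # proximal operators of the L1 norm (soft-thresholding) and of the
        # squared L2 norm (shrinkage), respectively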

        def w_w0_update():
            rhs = np.concatenate((z0 + 1.0/rho * u0, z1 + 1.0/rho * u1, z2 + 1.0/rho * u2))
            w_w0_lasso.fit(A, rhs)
            w = w_w0_lasso.coef_[:n_molecules*n_spectra]
            w0 = w_w0_lasso.coef_[n_molecules*n_spectra:]
            return w, w0
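
        # the z0 step has a closed form: minimizing
        #   -Y*log(z) + z + u0*z + rho/2*(z - Dw_w0)**2
        # elementwise gives the quadratic z**2 - tmp*z - Y/rho = 0 with
        # tmp = Dw_w0 - u0/rho - 1/rho; z0_update returns its positive root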

        def z0_update(Dw_w0, u0):
            tmp = Dw_w0 - 1/rho * u0 - 1/rho
            return 0.5 * (np.sqrt(tmp ** 2 + 4 * Y / rho) + tmp)

        def z1_update(w, u1):
            z1_lasso.fit(ssp.eye(z1.shape[0]), w - 1.0 / rho * u1)
            return z1_lasso.coef_

        def z2_update(diffs, u2):
            z2_ridge.fit(ssp.eye(z2.shape[0]), diffs - 1.0 / rho * u2)
            return z2_ridge.coef_

        def logdot(x, y):
            #if np.any((x>0)&(y==0)):
            #    return -np.inf
            return np.dot(x, np.log(y+1e-32))

        # log-likelihood for the original problem (w, w0 variables) 
        def LL(w, Dw_w0=None, diffs=None, w0=None):
            if Dw_w0 is None or diffs is None:
                assert w0 is not None
                rhs = A.dot(np.hstack((w, w0)))
                Dw_w0 = rhs[:n_masses*n_spectra]
                diffs = rhs[(n_masses+n_molecules)*n_spectra:]
            return logdot(Y, Dw_w0) - Dw_w0.sum() - lambda_ * w.sum() - theta * np.linalg.norm(diffs)**2

        # log-likelihood for the modified problem (variables w, w0, z0, z1, z2, u0, u1, u2)
        def LL_ADMM():
            return logdot(Y, z0) - z0.sum() - lambda_ * z1.sum() - theta * np.linalg.norm(z2)**2 \
                    - np.dot(u0, z0 - Dw_w0_estimate) \
                    - np.dot(u1, z1 - w_estimate) \
                    - np.dot(u2, z2 - diff_estimates) \
                    - rho/2 * np.linalg.norm(z0 - Dw_w0_estimate) ** 2 \
                    - rho/2 * np.linalg.norm(z1 - w_estimate) ** 2 \
                    - rho/2 * np.linalg.norm(z2 - diff_estimates) ** 2

        max_iter = 2000
        rhs = None
        for i in range(max_iter):
            logging.info("w,w0 update")
            w_estimate, w0_estimate = w_w0_update()
            rhs_old = rhs
            rhs = w_w0_lasso.predict(A)
            Dw_w0_estimate = rhs[:n_masses*n_spectra]
            diff_estimates = rhs[(n_masses+n_molecules)*n_spectra:]
            #print "w,w0 update", LL(w_estimate, Dw_w0_estimate, diff_estimates)
            #print w_estimate.reshape((self.nrows, self.ncols))
            #print w0_estimate.reshape((self.nrows, self.ncols))
            logging.info("z0 update")
            #print "LL_ADMM after w updates:", LL_ADMM()
            z_old = np.concatenate((z0, z1, z2))
            z0 = z0_update(Dw_w0_estimate, u0)
            #print np.linalg.norm(z0 - Dw_w0_estimate)
            #print "LL_ADMM after z0 update:", LL_ADMM()
            logging.info("z1 update")
            z1 = z1_update(w_estimate, u1)
            #print np.linalg.norm(z1 - w_estimate)
            #print "LL_ADMM after z1 update:", LL_ADMM()
            #print "z1 update", LL(z1, w0=w0_estimate)
            logging.info("z2 update")
            z2 = z2_update(diff_estimates, u2)

            #print np.linalg.norm(z2 - diff_estimates)
            #print "LL_ADMM after z2 update:", LL_ADMM()
            u_old = np.concatenate((u0, u1, u2))
            u0 += rho * (z0 - Dw_w0_estimate)
            u1 += rho * (z1 - w_estimate)
            u2 += rho * (z2 - diff_estimates)

            if rhs_old is not None:
                z = np.concatenate((z0, z1, z2))
                primal_diff = np.linalg.norm(rhs - z)
                dual_diff = rho * np.linalg.norm(A.T.dot(z - z_old))

                # residual balancing: keep the primal and dual residuals within
                # a factor of 10 of each other by rescaling rho; the prox
                # solvers depend on rho, so they are re-created when it changes
                rho_old = rho
                if primal_diff > 10 * dual_diff:
                    rho *= 2
                elif dual_diff > 10 * primal_diff:
                    rho /= 2
                if rho != rho_old:
                    print "rho <-", rho
                    w_w0_lasso = Lasso(alpha=lambda_/rho/A.shape[0], warm_start=True, fit_intercept=False, positive=True)
                    z1_lasso = Lasso(alpha=lambda_/rho/z1.shape[0], fit_intercept=False, warm_start=True, positive=False)
                    z2_ridge = ElasticNet(alpha=2*theta/rho/z2.shape[0], l1_ratio=0, warm_start=True, positive=False, fit_intercept=False)
                print primal_diff, dual_diff, primal_diff + dual_diff, LL(w_estimate, Dw_w0_estimate, diff_estimates)
        #print D.todense()
        #print (Y-Dw_w0_estimate).reshape((n_masses, n_spectra), order='F')
        print LL(w_estimate, Dw_w0_estimate, diff_estimates)
        print w_estimate.reshape((n_molecules, self.nrows, self.ncols), order='F').sum(axis=(1,2))
        #print w0_estimate.reshape((self.nrows, self.ncols), order='F')
        print self.sum_formulae
Example no. 48
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=1)

# set the final alpha by using LassoCV
Lambdas = np.logspace(-5, 5, 200)
lasso_cv = LassoCV(alphas=Lambdas, normalize=True, cv=10)
lasso_cv.fit(X_train, y_train)
print('Alpha is:' + str(round(lasso_cv.alpha_, 4)))
lasso = Lasso(alpha=lasso_cv.alpha_)

# predict
lasso.fit(X_train, y_train)
y_predict = lasso.predict(X)
y_test_predict = lasso.predict(X_test)

# model evaluation (MSE,MAE,std_error)
mse_predict = round(mean_squared_error(y_test, y_test_predict), 4)
mae_predict = round(mean_absolute_error(y_test, y_test_predict), 4)
std_error = round(Standard_error(y_test_predict), 4)

coef = []
for i in range(8):
    coef.append((factors[i], round(lasso.coef_[i], 4)))

print('Intercept is:' + str(round(lasso.intercept_, 4)))
print('Estimated coefficients are:' + str(coef))
print('Std Error is:' + str(std_error))
print('MSE is:' + str(mse_predict))
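Standard_error is a project-local helper that is not defined in this snippet; a minimal sketch consistent with how it is called here (a plain standard error of the predictions) could be:

import numpy as np

def Standard_error(sample):
    # standard error of the mean: sample standard deviation / sqrt(n);
    # assumes a 1-D array-like of predictions
    sample = np.asarray(sample)
    return np.std(sample, ddof=1) / np.sqrt(len(sample))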
Example no. 49
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

## PCA and Feature Selection

'''pca = PCA(n_components=100)  
pca.fit(X_train_scaled)
#print(pca.explained_variance_ratio_) 
X_train_reduced = pca.transform(X_train_scaled)
X_test_reduced = pca.transform(X_test_scaled)
'''

pca = PCA(n_components=800)
selection = SelectKBest(k=850)
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
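# FeatureUnion concatenates the two views: 800 PCA components followed by the
# 850 univariately selected columns (the two sets may carry overlapping signal)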
combined_features.fit(X_train_scaled, train_labels.ravel())
#print(pca.explained_variance_ratio_) 
X_train_reduced = combined_features.transform(X_train_scaled)
X_test_reduced = combined_features.transform(X_test_scaled)



## Train final Classifiers
#clf = Ridge(alpha=.5)
clf = Lasso(alpha=.03)
clf.fit(X_train_reduced, Y_train_raw)
Y_predicted = clf.predict(X_test_reduced)

## Save results to csv
np.savetxt('prediction.csv', Y_predicted, fmt='%.5f',delimiter=',')
Example no. 50
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=100)

##################################### - Lasso Regression - ##########################################

### Run a Lasso regressor over a set of alpha values and observe how R-squared, train RMSE and test RMSE change with alpha
train_rmse = []
test_rmse = []
R_sqrd = []
alphas = np.arange(0, 500, 1)
for i in alphas:
    LRM = Lasso(alpha=i, normalize=True, max_iter=500)
    LRM.fit(X_train, y_train)
    R_sqrd.append(LRM.score(X_train, y_train))
    train_rmse.append(np.sqrt(np.mean((LRM.predict(X_train) - y_train)**2)))
    test_rmse.append(np.sqrt(np.mean((LRM.predict(X_test) - y_test)**2)))

# Plot alpha vs R-squared, then train and test RMSE (all three scatters share
# one axes, so the later xlabel/ylabel calls overwrite the earlier ones).
plt.scatter(x=alphas, y=R_sqrd)
plt.xlabel("alpha")
plt.ylabel("R_Squared")
plt.scatter(x=alphas, y=train_rmse)
plt.xlabel("alpha")
plt.ylabel("RMSE")
plt.scatter(x=alphas, y=test_rmse)
plt.xlabel("alpha")
plt.ylabel("RMSE")
plt.legend(("alpha Vs R_Squared", "alpha Vs train_rmse", "alpha Vs test_rmse"))

## Another way of finding an alpha value is grid search with CV (as in the next snippet), but the sweep above is more informative.
Example no. 51
def LassoLambda(alpha, trainSet, validationSet):
  lassoreg = Lasso(alpha=alpha, normalize=True, max_iter=1e5)
  lassoreg.fit(trainSet.loc[:, 'x':'x_10'], trainSet.loc[:, 'y'])
  predict_lasso = lassoreg.predict(validationSet.loc[:, 'x':'x_10'].values)
  error_rate = np.linalg.norm(predict_lasso - validationSet.loc[:, 'y'], ord=2)
  return error_rate
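A hypothetical driver for LassoLambda, sweeping a grid of alphas and keeping the one with the smallest validation error; it assumes trainSet and validationSet are DataFrames with columns 'x' through 'x_10' and 'y', as the function itself does:

import numpy as np

alphas = np.logspace(-4, 1, 50)
errors = [LassoLambda(a, trainSet, validationSet) for a in alphas]
best_alpha = alphas[int(np.argmin(errors))]
print('best alpha: %s (validation L2 error %.4f)' % (best_alpha, min(errors)))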
Example no. 52
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
lasso = Lasso()
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 100]}
lasso_regression = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv=5)
lasso_regression.fit(X_train, y_train)
print(lasso_regression.best_params_)
print(lasso_regression.best_score_)


# In[88]:


lassoreg = Lasso(1e-08, normalize=True)
lassoreg.fit(X_train, y_train)
lasso_pred = lassoreg.predict(X_test)
print("R-Square Value",r2_score(y_test,lasso_pred))
print("\n")
print("mean_absolute_error :", metrics.mean_absolute_error(y_test, lasso_pred))
print("\n")
print("mean_squared_error : ", metrics.mean_squared_error(y_test, lasso_pred))
print("\n")
print("root_mean_squared_error : ", np.sqrt(metrics.mean_squared_error(y_test, lasso_pred)))


# In[89]:


sns.distplot(y_test-lasso_pred)

Example no. 53
X = X[index]
y = y[index]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

#train the model
print("train lr model...")
linear = Lasso(normalize=True, alpha=0.1)
linear = linear.fit(x_train, y_train)
print("train lr model...end ")
gdbr = GradientBoostingRegressor(n_estimators=100)
gdbr = gdbr.fit(x_train, y_train)
print("train gb model... over")
#test the model

y_pred_lr = linear.predict(x_test)
y_pred_gb = gdbr.predict(x_test)

y_pred = (y_pred_gb + y_pred_lr) / 2.0

loss = np.mean(np.square((y_test - y_pred)))
loss = np.power(loss, 0.5)

# loss = mean_squared_error(y_test,y_pred)
print loss

# sys.exit()

result = []

with open("/home/zsc/下载/data_new (3)/CIKM2017_testA/testA.txt", 'r') as f:
Example no. 54
        [target]).astype(np.float32)


x, y = to_xy(df, 'weight')
y = np.reshape(y, y.shape[0])

testx = finaltest.as_matrix().astype(np.float32)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

#feature selection:lasso
from sklearn import metrics
feature_sele = Lasso(random_state=0, alpha=0.1)
feature_sele.fit(x_train, y_train)
pred_lasso = feature_sele.predict(x_test)
score_lasso = np.sqrt(metrics.mean_squared_error(pred_lasso, y_test))

#svm
from sklearn.svm import SVR
svr = SVR(kernel='rbf')
svr.fit(x_train, y_train)
pred_svr = svr.predict(x_test)
score_svr = np.sqrt(metrics.mean_squared_error(pred_svr, y_test))

#knn
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(weights='distance')
knn.fit(x_train, y_train)
pred_knn = knn.predict(x_test)
score_knn = np.sqrt(metrics.mean_squared_error(pred_knn, y_test))
Example no. 55
def random_subset_sampling(num_pts, num_repeats, X, y):
    indices = list(range(0, X.shape[0]))

    model = LassoCV(cv=5, verbose=False, eps=1e-5)
    feat_hist = np.zeros(X.shape[1])
    feat_hist_avg = np.zeros(X.shape[1])
    aicc_cut = 0.001

    # Standardize data
    y = (y - np.mean(y)) / np.std(y)
    for i in range(1, X.shape[1]):
        X[:, i] = (X[:, i] - np.mean(X[:, i])) / np.std(X[:, i])

    tot_num_avg = 0
    for i in range(num_repeats):
        print("Sample {} of {}".format(i, num_repeats))
        shuffle(indices)
        indx = np.array(indices[:num_pts])
        X_sel = X[indx, :]
        y_sel = y[indx]
        model.fit(X_sel, y_sel)
        nonzero = np.nonzero(model.coef_)[0]
        print("Num coeff. {}".format(len(nonzero)))
        feat_hist[nonzero] += 1

        aicc_vals = np.zeros(len(model.alphas_))
        nonzeros = []
        # print(model.alphas_)
        # print(model.alpha_)
        # exit()
        for j in range(len(model.alphas_)):
            m = Lasso(alpha=model.alphas_[j])
            m.fit(X_sel, y_sel)
            coeff = m.coef_
            nonzero = np.nonzero(coeff)[0]
            pred = m.predict(X_sel)
            rmse = np.sqrt(np.mean((y_sel - pred)**2))
            rss = np.sum((y_sel - pred)**2)
            if rmse**2 < 1e-12:
                rmse = 1e-6
            #print("RMSE: {:e}".format(rmse))
            numCoeff = len(nonzero)
            nonzeros.append(nonzero)

            if numCoeff >= X_sel.shape[0] - 1:
                aicc_vals[j] = 1e100
            else:
                aicc_vals[j] = aicc(numCoeff, X_sel.shape[0], rss)

        aicc_vals -= np.min(aicc_vals)
        w = np.exp(-aicc_vals)
        #print(w)
        contribute = np.nonzero(w > aicc_cut)[0]
        print(contribute)
        for idx in contribute:
            feat_hist_avg[nonzeros[idx]] += 1.0
            tot_num_avg += 1

    feat_hist_avg /= tot_num_avg
    feat_hist /= num_repeats

    fname = 'random_partition{}.csv'.format(num_pts)
    np.savetxt(fname, np.vstack((feat_hist_avg, feat_hist)).T, delimiter=',')
    print("Selection data written to {}".format(fname))
    fig = plt.figure()

    ax = fig.add_subplot(1, 1, 1)
    ax.plot(feat_hist_avg, label='Avg')
    ax.plot(feat_hist, label='Opt.')
    ax.legend()

    print("Std avg: {}".format(np.std(feat_hist_avg)))
    print("Std opt: {}".format(np.std(feat_hist)))
    plt.show()
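The aicc helper (and the bic helper used in a later snippet) is not shown; standard Gaussian-likelihood definitions consistent with how they are called here would be:

import numpy as np

def aicc(num_coeff, num_data, rss):
    # corrected Akaike information criterion for least-squares fits;
    # the caller must guard num_data - num_coeff - 1 > 0, as the snippet does
    aic = num_data * np.log(rss / num_data) + 2.0 * num_coeff
    return aic + (2.0 * num_coeff * (num_coeff + 1)) / (num_data - num_coeff - 1)

def bic(num_coeff, num_data, rss):
    # Bayesian information criterion for least-squares fits
    return num_data * np.log(rss / num_data) + num_coeff * np.log(num_data)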
Example no. 56
                            scoring='mean_absolute_error',
                            cv=10)
ozone_ridgecv_reg = ozone_ridgecv_reg.fit(ozone_train.drop('ozone', axis=1),
                                          ozone_train['ozone'])

## Compare regularization models
print("Linear Coef: " + str(ozone_ln_reg.coef_) + "\nRidge Coef: " +
      str(ozone_ridge_reg.coef_) + "\nLasso Coef: " +
      str(ozone_lasso_reg.coef_) + "\nCV Coef: " +
      str(ozone_ridgecv_reg.coef_) + "\nCV alpha: " +
      str(ozone_ridgecv_reg.alpha_))

# Predict using models and evaluate
ozone_ln_pred = ozone_ln_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_ridge_pred = ozone_ridge_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_lasso_pred = ozone_lasso_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_ridgecv_pred = ozone_ridgecv_reg.predict(ozone_test.drop('ozone',
                                                               axis=1))

## Calculate MAE, RMSE, and R-squared for all models
ozone_ln_mae = metrics.mean_absolute_error(ozone_test['ozone'], ozone_ln_pred)
ozone_ln_rmse = sqrt(
    metrics.mean_squared_error(ozone_test['ozone'], ozone_ln_pred))
ozone_ln_r2 = metrics.r2_score(ozone_test['ozone'], ozone_ln_pred)

ozone_ridge_mae = metrics.mean_absolute_error(ozone_test['ozone'],
                                              ozone_ridge_pred)
ozone_ridge_rmse = sqrt(
    metrics.mean_squared_error(ozone_test['ozone'], ozone_ridge_pred))
ozone_ridge_r2 = metrics.r2_score(ozone_test['ozone'], ozone_ridge_pred)
Example no. 57
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))



# # Using regularized methods for regression




lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
print(lasso.coef_)




print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))


# Ridge regression:
Example no. 58
def lassoCVPath(X, y):
    model = LassoCV(cv=10, verbose=False, eps=1e-5)
    model.fit(X, y)
    fig_path = plt.figure()
    ax_path = fig_path.add_subplot(1, 1, 1)

    pred_error = np.mean(np.sqrt(model.mse_path_) * 1000.0, axis=1)
    min_indx = np.argmin(pred_error)
    x_ax = np.log10(model.alphas_)

    ax_path.plot(x_ax, pred_error, color='#7c6868', label="CV")
    ax_path.axvline(np.log10(model.alphas_[min_indx]),
                    ls='--',
                    color='#7c6868')

    # Calculate AIC
    aicc_vals = np.zeros(len(model.alphas_))
    bic_vals = np.zeros_like(aicc_vals)
    nonzeros = []
    for i in range(len(model.alphas_)):
        m = Lasso(alpha=model.alphas_[i])
        m.fit(X, y)
        coeff = m.coef_
        nonzero = np.nonzero(coeff)[0]
        pred = m.predict(X)
        rss = np.sum((pred - y)**2)
        if rss < 1e-12:
            rss = 1e-12
        numCoeff = len(nonzero)
        nonzeros.append(nonzero)
        print(numCoeff, np.sqrt(rss / len(pred)), model.alphas_[i])
        aicc_vals[i] = aicc(numCoeff, X.shape[0], rss)
        bic_vals[i] = bic(numCoeff, X.shape[0], rss)

    ax_path2 = ax_path.twinx()
    ax_path2.plot(x_ax, aicc_vals, color='#b63119', label="AICc")

    min_indx = np.argmin(aicc_vals)
    ax_path.axvline(x_ax[min_indx], ls='--', color='#b63119')

    ax_path2.plot(x_ax, bic_vals, color='#cb9f52', label="BIC")

    min_indx = np.argmin(bic_vals)
    ax_path.axvline(x_ax[min_indx], ls='--', color='#cb9f52')
    ax_path.legend(frameon=False)

    ax_path.set_ylabel("CV (meV/atom)")
    ax_path.set_xlabel("log \$\\lambda\$")
    ax_path2.legend(frameon=False)
    ax_path2.set_ylabel("AICc/BIC")
    #ax_path2.set_ylim([-30000, -16500])

    fig2 = plt.figure()
    ax2 = fig2.add_subplot(1, 1, 1)

    for i, non in enumerate(nonzeros):
        x = [np.log10(model.alphas_[i]) for _ in range(len(non))]
        ax2.plot(x,
                 non,
                 ls='none',
                 marker='o',
                 mfc='none',
                 color='#7c6868',
                 markersize=1.5)

    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax2.set_xlabel("log \$\\lambda\$")
    ax2.set_ylabel("Feature no.")
    plt.show()
Example no. 59
    def run(self):
        # Model selection phase
        internal_k = self._params['internal_k']

        # Perform k splits once
        skf = StratifiedKFold(self._Ytr, n_folds=internal_k)

        # store the mean accuracies for each model
        acc_list = np.empty((len(self._params['tau_range']),))

        TAU_MAX = self.get_l1_bound()
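        # get_l1_bound() is defined elsewhere; presumably it returns the
        # smallest tau yielding an all-zero Lasso solution (on the order of
        # max|X^T y| / n_samples), so tau_range scales it down from there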
        # print("TAU_MAX = {}".format(TAU_MAX))

        for i, tau_scaling in enumerate(self._params['tau_range']):
            tau = TAU_MAX * tau_scaling
            # print("{}-th value of tau ({})".format(i+1, tau))

            acc = 0
            # number of solutions which consisted of only zeros
            # (early stopping for too big tau)
            N_allzeros = 0

            for idx_tr, idx_ts in skf:
                Xk_tr = self._Xtr[idx_tr, :]
                Xk_ts = self._Xtr[idx_ts, :]

                Yk_tr = self._Ytr[idx_tr]
                Yk_ts = self._Ytr[idx_ts]

                clf = Lasso(alpha=tau)
                clf.fit(Xk_tr, Yk_tr)  # fit the model

                # extract only nonzero coefficients
                selected_features = np.argwhere(clf.coef_).ravel()
                # print("Selected {} features".format(len(selected_features)))

                if len(selected_features) == 0:
                    # If no features are selected, just assign all samples to
                    # the most common class (in the training set)
                    N_allzeros += 1
                    Yk_lr = np.ones((len(Yk_ts),)) * np.sign(Yk_tr.sum() + 0.1)

                else:
                    # Else, run OLS and get weights for coefficients NOT
                    # affected by shrinking
                    Xk_tr2 = Xk_tr[:, selected_features]
                    Xk_ts2 = Xk_ts[:, selected_features]

                    clf = LinearRegression(normalize=False)
                    clf.fit(Xk_tr2, Yk_tr)  # fit the model
                    Yk_lr = clf.predict(Xk_ts2)  # predict test data
                    Yk_lr = np.sign(Yk_lr)  # take the sign

                acc += accuracy_score(Yk_ts, Yk_lr)

            acc_list[i] = acc / internal_k

            if N_allzeros == internal_k:
                # All k-fold splits returned empty solutions, stop here as
                # bigger values of tau would return empty solutions as well
                print("The {}-th value of tau ({}) returned only empty "
                      "solutions".format(i + 1, tau))
                break

        # Final train with the best choice for tau
        best_tau_idx = np.argmax(acc_list)
        # best_tau = self._params['tau_range'][best_tau_idx]
        best_tau = self._params['tau_range'][best_tau_idx] * TAU_MAX

        clf = Lasso(alpha=best_tau)
        clf.fit(self._Xtr, self._Ytr)  # fit the model

        # extract only nonzero coefficients
        # selected_features = np.argwhere(clf.coef_)[0]
        selected_features = np.argwhere(clf.coef_).ravel()

        if len(selected_features) == 0:
            print("WARNING: the allegedly best solution (tau = {}) was "
                  " empty".format(best_tau))

            sign = np.sign(np.sum(self._Ytr) + 0.1)
            Y_lr = np.ones((len(self._Yts)),) * sign
            Y_lr_tr = np.ones((len(self._Ytr)),) * sign

        else:
            X_tr2 = self._Xtr[:, selected_features]
            X_ts2 = self._Xts[:, selected_features]

            clf = LinearRegression()
            clf.fit(X_tr2, self._Ytr)  # fit the model

            Y_lr = clf.predict(X_ts2)  # predict test data
            Y_lr = np.sign(Y_lr)  # take the sign

            Y_lr_tr = clf.predict(X_tr2)  # predict training data
            Y_lr_tr = np.sign(Y_lr_tr)  # take the sign

        result = {}
        result['selected_list'] = selected_features
        # result['beta_list'] = result['beta_list'][0]
        result['prediction_ts_list'] = Y_lr
        result['prediction_tr_list'] = Y_lr_tr
        result['labels_ts'] = self._Yts
        return result
Example no. 60
    pred_test_LGB = myLGB.predict(X_test)

    # Stacking
    stackedset = pd.DataFrame({'A': []})
    stackedset = pd.concat([stackedset, pd.DataFrame(pred_test_l2)], axis=1)
    stackedset = pd.concat([stackedset, pd.DataFrame(pred_test_l1)], axis=1)
    stackedset = pd.concat([stackedset, pd.DataFrame(pred_test_GBR)], axis=1)
    stackedset = pd.concat([stackedset, pd.DataFrame(pred_test_ENet)], axis=1)
    stackedset = pd.concat([stackedset, pd.DataFrame(pred_test_LGB)], axis=1)
    # prod = (pred_test_l2*pred_test_l1*pred_test_GBR*pred_test_ENet*pred_test_LGB) ** (1.0/5.0)
    # stackedset = pd.concat([stackedset,pd.DataFrame(prod)],axis=1)
    Xstack = np.array(stackedset)
    Xstack = np.delete(Xstack, 0, axis=1)
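    # NOTE: the level-2 model below is fit on the test-set predictions against
    # y_test itself, so pred_test_stack is an in-sample fit for the stacker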
    l1_stacked = Lasso(alpha=0.0001, fit_intercept=True)
    l1_stacked.fit(Xstack, y_test)
    pred_test_stack = l1_stacked.predict(Xstack)
    models.append([l2Regr, l1Regr, myGBR, ENet, myLGB, l1_stacked])

    # Model prediction on the scoring set
    X_score = np.array(df_score)
    X_score = np.delete(X_score, 0, 1)
    M = X_score.shape[0]
    scores_fin = 1 + np.zeros(M)
    for m in models:
        ger = m[0]
        las = m[1]
        gbr = m[2]
        Enet = m[3]
        lgb = m[4]
        las2 = m[5]
        ger_predict = ger.predict(X_score)