Example #1
 def lasso(self,training,target,feature_index_list):
     clf=Lasso(self.alpha,fit_intercept=False)
     clf.fit(training,target)
     coef=np.zeros(self.n_features)
     for index,feature_index in enumerate(feature_index_list):
         coef[feature_index]=clf.coef_[index]
     return coef
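Usage note (an editorial addition, not from the original repository): the helper fits Lasso on a reduced design matrix and scatters the fitted coefficients back to their positions in the full feature space. A minimal stand-alone sketch of the same idea, with made-up sizes and indices:

import numpy as np
from sklearn.linear_model import Lasso

n_features = 10                        # size of the full feature space (assumed)
feature_index_list = [2, 5, 7]         # columns the reduced design matrix corresponds to
rng = np.random.default_rng(0)
training = rng.normal(size=(50, len(feature_index_list)))
target = training @ np.array([1.0, 0.0, -2.0])

clf = Lasso(alpha=0.01, fit_intercept=False)
clf.fit(training, target)

coef = np.zeros(n_features)
coef[feature_index_list] = clf.coef_   # scatter back to the original indices
print(np.nonzero(coef)[0])             # nonzero entries land at positions from feature_index_list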
Example #2
def lasso_regression(features, solutions, verbose=0):
    columns = solutions.columns

    clf = Lasso(alpha=1e-4, max_iter=5000)

    print('Training Model... ')
    clf.fit(features, solutions)
    
    feature_coeff = clf.coef_
    features_importances = np.zeros((169, 3))
    for idx in range(3):
        features_importance = np.reshape(feature_coeff[idx, :], (169, 8))
        features_importance = np.max(features_importance, axis=1)
        features_importances[:, idx] = features_importance
        
    features_importance_max = np.max(features_importances, axis=1)
    features_importance_max = np.reshape(features_importance_max, (13, 13))
    plt.pcolor(features_importance_max)
    plt.title("Feature importance for HoG")
    plt.colorbar()
    plt.xticks(np.arange(0.5, 13.5), range(1, 14))
    plt.yticks(np.arange(0.5, 13.5), range(1, 14))
    plt.axis([0, 13, 0, 13])
    plt.show()
    
    print('Done Training')
    return (clf, columns)
Example #3
    def train(self, x, y, param_names, random_search=100, **kwargs):
        start = time.time()
        scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

        # Check that each input is between 0 and 1
        self._check_scaling(scaled_x=scaled_x)

        if self._debug:
            print "Shape of training data: ", scaled_x.shape
            print "Param names: ", self._used_param_names
            print "First training sample\n", scaled_x[0]
            print "Encode: ", self._encode

        # Do a random search
        alpha = self._random_search(random_iter=100, x=scaled_x, y=y)

        # Now train model
        lasso = Lasso(alpha=alpha,
                      fit_intercept=True,
                      normalize=False,
                      precompute='auto',
                      copy_X=True,
                      max_iter=1000,
                      tol=0.0001,
                      warm_start=False,
                      positive=False)

        lasso.fit(scaled_x, y)
        self._model = lasso

        duration = time.time() - start
        self._training_finished = True
        return duration
Example #4
def reg_skl_lasso(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    lasso = Lasso(alpha=param["alpha"], normalize=True)
    lasso.fit(X_tr, y_reg_tr)
    pred = lasso.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
Example #5
def lassoreg(a):
    print ("Doing lasso regression")
    clf2 = Lasso(alpha=a)
    clf2.fit(base_X, base_Y)
    print ("Score = %f" % clf2.score(base_X, base_Y))
    clf2_pred = clf2.predict(X_test)
    write_to_file("lasso.csv", clf2_pred)
Example #6
def traverse_movies_lasso():
	LBMAP = getLBMap()
	DMAP = createEmpty()

	P_ERRORS, ERRORS = [], []

	training_data, training_response = [], []

	for i in range(len(data)):

		movie = data[i]
		m_rev = movie['revenue']

		myvector = vectorizeMovie(movie, LBMAP, DMAP)

		if i > 3695:
			model = Lasso(alpha = .05)
			model.fit(training_data, training_response)
			raw = math.fabs(model.predict(myvector) - m_rev)
			ERRORS.append(raw)
			#P_ERRORS.append(round(raw/m_rev, 4))
		
		training_data.append(myvector)
		training_response.append(m_rev)

		DMAP = update(movie, DMAP)

	#print 'all', avg_float_list(P_ERRORS)
	print 'all', avg_float_list(ERRORS)
Example #7
def precision_recall_samples(X, y):
    pr_lasso = precision_recall(support.T[-1], lasso_coefs(X, y))
    stability = stability_selection(X, y, pi=None)
    estimated = []
    for st in np.unique(stability):
        estimated.append(stability > st - 1.e-12)
    pr_ss = precision_recall(support.T[-1], estimated)

    n_samples, n_features = X.shape
    alpha_max = np.max(np.dot(y, X)) / n_samples
    alpha = .1 * alpha_max
    clf = Lasso(alpha=alpha)
    abs_coef = np.abs(clf.fit(X, y).coef_)
    estimated = []
    for th in np.unique(abs_coef):
        estimated.append(abs_coef > th - 1.e-12)

    pr_pt = precision_recall(support.T[-1], estimated)
    clf = BootstrapLasso(alpha=alpha, n_bootstraps=n_bootstraps)
    abs_coef = np.abs(clf.fit(X, y).coef_)
    estimated = []
    for th in np.unique(abs_coef):
        estimated.append(abs_coef > th - 1.e-12)

    pr_bpt = precision_recall(support.T[-1], estimated)
    return pr_lasso, pr_ss, pr_pt, pr_bpt
Example #8
    def __init__(self, penalty='l1', dual=None, C=None, alpha=None):

        self.l1 = True if penalty=="l1" else False
        if self.l1:
            Lasso.__init__(self, alpha=alpha)
        else:
            Ridge.__init__(self, alpha=alpha)
Example #9
    def RunLASSOScikit(q):
      totalTimer = Timer()

      # Load input dataset.
      Log.Info("Loading dataset", self.verbose)
      inputData = np.genfromtxt(self.dataset[0], delimiter=',')
      responsesData = np.genfromtxt(self.dataset[1], delimiter=',')

      # Get all the parameters.
      lambda1 = re.search(r"-l (\d+)", options)
      lambda1 = 0.0 if not lambda1 else int(lambda1.group(1))
          
      try:
        with totalTimer:
          # Perform LASSO.
          model = Lasso()
          model.fit(inputData, responsesData)
          out = model.coef_
      except Exception as e:
        q.put(-1)
        return -1

      time = totalTimer.ElapsedTime()
      q.put(time)
      return time
Example #10
class SparseSelector(BaseEstimator):
    """
    Sparse L1-based feature selection. Parameters are passed on to
    sklearn.linear_model.Lasso, which does the actual work.
    """
    def __init__(self, alpha=1.0, fit_intercept=True, 
                 normalize=False):
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.lasso = None

    def fit(self, X, y):
        self.lasso = Lasso(alpha=self.alpha, 
                           fit_intercept=self.fit_intercept,
                           normalize=self.normalize)
        self.lasso.fit(X, y)
        return self
        
    def transform(self, X):
        cols = np.nonzero(self.lasso.sparse_coef_)[1]
        if sp.sparse.issparse(X):
            return X.tocsc()[:, cols]
        else:
            return X[:, cols]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)
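A hedged usage sketch of the selector above on synthetic data (shapes, alpha, and variable names are illustrative, not from the original project; the class itself assumes `numpy as np`, `scipy as sp`, and scikit-learn's `BaseEstimator` and `Lasso` are imported):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 30))
y = 3.0 * X[:, 0] - 2.0 * X[:, 5] + rng.normal(scale=0.1, size=200)

selector = SparseSelector(alpha=0.05)
X_reduced = selector.fit_transform(X, y)
print(X.shape, "->", X_reduced.shape)  # only columns with nonzero Lasso coefficients remain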
Example #11
    def fit(self, sklearn_alpha=None, **lasso_args):
        """
        Fit the lasso using `Lasso` from `sklearn`.
        This sets the attribute `soln` and
        forms the constraints necessary for post-selection inference
        by calling `form_constraints()`.

        Parameters
        ----------

        sklearn_alpha : float
            Lagrange parameter, in the normalization set by `sklearn`.

        lasso_args : keyword args
             Passed to `sklearn.linear_model.Lasso`_

        Returns
        -------

        soln : np.float
             Solution to lasso with `sklearn_alpha=self.lagrange`.
             
        
        """

        # fit Lasso using scikit-learn
        
        clf = Lasso(alpha = self.lagrange, fit_intercept = False)
        clf.fit(self.X, self.y, **lasso_args)
        self._soln = beta = clf.coef_       
        if not np.all(beta == 0):
            self.form_constraints()
        else:
            self.active = []
        return self._soln
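For reference (an editorial addition, not part of the original class): scikit-learn's Lasso minimizes (1 / (2 * n_samples)) * ||y - X beta||^2 + alpha * ||beta||_1, so the `lagrange` value stored on the object is passed straight through as sklearn's `alpha`. A hedged sketch of the equivalent direct call on made-up data:

import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
X = rng.normal(size=(40, 8))
y = rng.normal(size=40)
lagrange = 0.1                          # stands in for self.lagrange
clf = Lasso(alpha=lagrange, fit_intercept=False)
soln = clf.fit(X, y).coef_              # the array the method stores as self._soln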
Example #12
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # nearest randomforest
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.ix[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.ix[:, chosen])  
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    models= [ 'pH_rdg_prds', 'pH_las_prds', 
              'pH_for_prds', 'pH_for_prds' ] 
    name = 'pH_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'pH')
Example #13
def lassoRegression(X,y):

    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Lasso Regression")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)

    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    lassoRegression = Lasso(alpha=1e-7)
    lassoRegression.fit(scaled_Xp,y)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0,2,0.01)
    dummyX = dummyX.reshape((dummyX.shape[0],1))
    dummyXp = polynomialFeatures.fit_transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = lassoRegression.predict(scaled_dummyXp)

    outputFILE = 'plot-lassoRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h = 6.0, w = 10.0)
    ax.axis([0,2,0,15])
    ax.scatter(X,y,color="black",s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
Example #14
    def build_linmodel(self):
        x = 'nba_15season_all_150928.csv'
        self.train, self.test, self.id_df = lin_encode(
                                               filename = self.name,
                                               min_cutoff = self.min_cutoff,
                                               TRANSFORM_CUTOFF = self.t_cutoff)
        #Create validation set
        np.random.seed(2)
        self.test = self.test.reindex(np.random.permutation(self.test.index))
        self.test = self.test.iloc[:self.test.shape[0]/2,:]
        self.test = self.test.reset_index().drop('index', axis = 1)

        """
        # TESTING GROUNDS
        bestcol = [...put list of columns here for testing features...]
        bestcol = bestcol + ['points']
        """
        # Use these next two lines if you want to filter out these aggregates
        bestcol = np.logical_not(self.train.columns.str.contains(
                                                                '_std|_max|'+\
                                                                '_min|_5std|'+\
                                                                '_5max|_5min|'+\
                                                                'Unnamed'))
        bestcol = self.train.columns[bestcol]

        # Subset of features you want to train on
        self.train = self.train[bestcol]
        self.test = self.test[bestcol]

        print self.train.shape, self.test.shape
        ###
        print 'train shape and test shape', self.train.shape, self.test.shape
        X = self.train.as_matrix(self.train.columns[:-1]).astype(float)
        y = self.train.as_matrix(['points'])[:, 0].astype(float)
        X_test = self.test.as_matrix(self.test.columns[:-1]).astype(float)
        self.y_test = self.test.as_matrix(['points'])[:, 0].astype(float)

        # Choose which type of linear regression to test
        if self.lintype == 'Linear':
            self.lr = LinearRegression(**self.params)
        elif self.lintype == 'Ridge':
            self.lr = Ridge(**self.params)
        elif self.lintype == 'Lasso':
            self.lr = Lasso(**self.params)
        else:
            return "Error: Choose lin. reg. type: 'Linear', 'Ridge', 'Lasso'"
        self.lr.fit(X, y)
        self.y_pred = self.lr.predict(X_test)

        error = mean_squared_error(self.y_pred, self.y_test)**0.5
        print 'RMSE:', error

        # Getting attributes from LinearRegression()
        coef = self.lr.coef_
        self.coef_imp = pd.DataFrame({'feature': self.train.columns[:-1],
                                      'coefficient': coef})
        self.coef_imp = self.coef_imp.sort('coefficient', ascending = False)
        self.coef_imp = self.coef_imp.reset_index().drop('index', axis = 1)
        self.intercept = self.lr.intercept_
Example #15
    def classify(self):
        """Perform classification"""
        clf = Lasso(max_iter=10000000)
        #parameters = {'alpha':[0.001,0.005,0.01,0.05,0.1,0.5,1,5.0,10.0]}
        #clf = GridSearchCV(lasso, parameters,scoring='roc_auc')

        clf.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
        self._ClassifyDriver__y = clf.predict(self._ClassifyDriver__testdata)
Example #16
def varselect_w_lass(all_vars_list, selected_vars, alpha_val):
    lass = Lasso(alpha=alpha_val,
                 positive=True, max_iter=100000 , tol=.0001)
    lass.fit(np.array(fire_train_TRAIN_smp[all_vars_list]),
             np.array(fire_train_TRAIN_smp.target ))
    for x in range(1, len(all_vars_list)):
        if lass.coef_[x]> .00000001:
            selected_vars.append(all_vars_list[x])
Example #17
def trainModel(x, y, degree=1):
    """Self-designed explicit method to train the model using linear regression."""
    #poly = PolynomialFeatures(degree)
    #z = poly.fit_transform(x)
    #return np.dot(np.linalg.pinv(z), y)
    clf = Lasso(alpha=.5)
    clf.fit(x, y)
    return clf
Example #18
def lasso(data, targets):
    """
    Returns a Lasso linear model for predictions with alpha 0.1.
    Takes the data and the associated targets as arguments.
    """
    model = Lasso(alpha=0.1)
    model.fit(data, targets)
    return model
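A minimal usage sketch for the helper above (synthetic data; the shapes are made up, not from the original code):

import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(100, 5))
targets = data @ np.array([1.0, 0.0, 0.0, -2.0, 0.5])

model = lasso(data, targets)
print(model.coef_)               # sparse coefficient vector; alpha=0.1 zeroes weak features
print(model.predict(data[:3]))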
Example #19
def weight_analysis(verbose=0, stack_option='s'):
  logging.info('starting ensemble weight analysis')

  stack = STACK if stack_option == 's' else MODELS

  pool = multiprocessing.Pool(processes=4)
  drivers = settings.DRIVER_IDS#[:1000]
  CUTOFF = -1
  results = pool.map(
      compute_weights,
      map(lambda x: (x, verbose, stack_option), drivers)
  )

  predictions = {}
  for i, get_data, model, _ in stack:
    predictions[i] = np.array(list(itertools.chain(*[r[1][i] for r in results])))
  testY = list(itertools.chain(*[r[2] for r in results]))

  model_names = [
      ('%s.%s.%s' % (get_data.func_name, model.__name__, i), i)
      for i, get_data, model, repeat in stack
  ]
  model_names.sort(key=lambda x: x[0])
  keys = [x[1] for x in model_names]
  model_names = [x[0] for x in model_names]

  lasso = Lasso(alpha=0.0, positive=True)
  trainX = []
  for row_id in xrange(len(testY)):
    train_row = [predictions[i][row_id] for i in keys]
    trainX.append(train_row)

  a, b = trainX[:CUTOFF], trainX[CUTOFF:]
  c, d = testY[:CUTOFF], testY[CUTOFF:]
  lasso.fit(a, c)
  pred = lasso.predict(b)
  pred_train = lasso.predict(a)
  #logging.info('auc: %s' % util.compute_auc(d, pred))

  logging.info('coefficients:')
  weights = {}
  for i, name in enumerate(model_names):
    logging.info('%s: %.3f' % (model_names[i], lasso.coef_[i]))
    weights[keys[i]] = lasso.coef_[i]

  logging.info('individual scores:')
  for i, key in enumerate(keys):
    logging.info('%s: %.3f' % (
        model_names[i],
        util.compute_auc(testY, predictions[key])
    ))

  logging.info('weights dictionary: %s' % weights)

  # and again in the end, so you don't have to scroll
  logging.info('------------')
  #logging.info('auc: %s' % util.compute_auc(d, pred))
  logging.info('auc train: %s' % util.compute_auc(c, pred_train))
Example #20
def Lasso_Regression(kf,data,label,k):
	val=0
	for train, test in kf:
		X_train, X_test, y_train, y_test = data[train,:], data[test,:], label[train], label[test]
		log =  Lasso(alpha=0.1)
		logit = log.fit(X_train,y_train)
		y_pred =  logit.predict(X_test)
		val+= metrics.mean_squared_error(y_test, y_pred)  
	return val/3
Example #21
def comparaison_ridge_lasso(X,Y):
    X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=random.seed())
    clf_lasso = Lasso(selection='random', random_state=random.seed())
    clf_ridge = Ridge()
    clf_lasso.fit(X_train,Y_train)
    clf_ridge.fit(X_train,Y_train)
    score_lasso=clf_lasso.score(X_test,Y_test)
    score_ridge=clf_ridge.score(X_test,Y_test)
    print("Precision de Lasso={:3.2f}% \nPrecision de Ridge={:3.2f}%\n".format(score_lasso*100,score_ridge*100))
Example #22
def trainModel_phase2(x, y, degree=1):
    """Self-designed explicit method to train the model using linear regression."""
    #poly = PolynomialFeatures(degree)
    #z = poly.fit_transform(x)
    #return np.dot(np.linalg.pinv(z), y)
    #clf = BernoulliRBM()
    #clf = LinearRegression()
    clf = Lasso(alpha=.5)
    clf.fit(x.reshape(-1, 1), y)
    return clf
Example #23
class Linear():
    def __init__(self, type='Ridge', alpha=3, C=1.0, nu=0.2, limit=None, \
            epsilon=0.1):
        self.limit = limit
        if type == 'Ridge':
            self.model = Ridge(alpha=alpha)
        elif type == 'SVR':
            self.model = SVR(kernel='linear', C=C, epsilon=epsilon)
        elif type == 'NuSVR':
            self.model = NuSVR(C=C, nu=nu, kernel='linear')
        elif type == 'Lasso':
            self.model = Lasso(alpha=alpha)
        
    @staticmethod
    def get_cal(m):
        # get qualitative (categorical) features
        # watch out as indices depend on feature vector!
        return np.hstack((m[:,:23], m[:,24:37], m[:,38:52])) + 1
    
    @staticmethod
    def get_cant(m):
        # get quantitative (numeric) features
        # watch out as indices depend on feature vector!
        return np.hstack((m[:,23:24], m[:,37:38], m[:,52:]))
        
    def fit(self, train_X, train_Y):
        # no fitting done here, just saving data
        if self.limit:
            if len(train_X) > self.limit:
                train_X = train_X[-self.limit:]
                train_Y = train_Y[-self.limit:]
        self.train_X = np.array(train_X)
        self.train_Y = np.array(train_Y)
        
        
    def predict(self, test_X):
        # fitting done here
        # not efficient on the long term
        test_X = np.array(test_X)
        enc = OneHotEncoder()
        scal = MinMaxScaler()
        data = np.vstack((self.train_X, test_X))
        enc.fit(self.get_cal(data))
        scal.fit(self.get_cant(data))
        
        new_train_X1 = enc.transform(self.get_cal(self.train_X))
        new_train_X2 = scal.transform(self.get_cant(self.train_X))
        new_train_X = scipy.sparse.hstack((new_train_X1, new_train_X2))
        new_test_X1 = enc.transform(self.get_cal(test_X))
        new_test_X2 = scal.transform(self.get_cant(test_X))
        new_test_X = scipy.sparse.hstack((new_test_X1, new_test_X2))
        
        self.model.fit(new_train_X, self.train_Y)
        R = self.model.predict(new_test_X)
        return R
Example #24
def test_lasso_regression():
	datafile_viper = '../data_viper/viper.pkl'
	viper = loadfile(datafile_viper)

	from sklearn.linear_model import Lasso

	model = Lasso(alpha=1e-3)
	model.fit(viper.train_feat, viper.train_y)

	y_pred = model.predict(viper.test_feat)
	print 'testing error {}'.format(abs_error(y_pred, viper.test_y)) 
Example #25
 def __init__(self, type='Ridge', alpha=3, C=1.0, nu=0.2, limit=None, \
         epsilon=0.1):
     self.limit = limit
     if type == 'Ridge':
         self.model = Ridge(alpha=alpha)
     elif type == 'SVR':
         self.model = SVR(kernel='linear', C=C, epsilon=epsilon)
     elif type == 'NuSVR':
         self.model = NuSVR(C=C, nu=nu, kernel='linear')
     elif type == 'Lasso':
         self.model = Lasso(alpha=alpha)
Example #26
def fit_predict_model(l1_penalty):
    RSS = np.zeros((len(l1_penalty)))
    num_nonzero_coeff = np.zeros((len(l1_penalty)))
    idx = 0
    for l1_penalty_choice in l1_penalty:
        model = Lasso(alpha=l1_penalty_choice, normalize=True)
        model.fit(training[all_features], training['price'])
        predicted_price = model.predict(validation[all_features])
        RSS[idx] = np.sum((predicted_price - validation['price'])**2)
        num_nonzero_coeff[idx] = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
        idx += 1
    return (RSS, num_nonzero_coeff, model)
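A hedged usage sketch for the sweep above; the `training`/`validation` DataFrames and the `all_features` list referenced inside the function are assumed to already exist, and the penalty grid is illustrative:

import numpy as np

l1_penalty = np.logspace(1, 7, num=13)
RSS, num_nonzero_coeff, _ = fit_predict_model(l1_penalty)
best = np.argmin(RSS)
print(l1_penalty[best], RSS[best], num_nonzero_coeff[best])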
Example #27
def main(folds = 5):
    print "folds: ", folds
    #read in  data, parse into training and target sets
    print "\n ------------------Load file --------------- \n"
    train = np.loadtxt(sys.argv[1]).T
    min_max_scaler = preprocessing.MinMaxScaler()
    train = min_max_scaler.fit_transform(train)
	#test data set
    xtest = train[100:112, :]
    train = train[0:100, :]
    print "Size of read data: ", train.shape
    #train = imputation_missingValue(train)
    print "After Standardization:"
    print train
  
    target = np.loadtxt(sys.argv[2]).T
    ytest = target[100:112, :]
    target = target[0:100,:]
    print "Size of read data: ", target.shape

    al = 0.3
    rf = Lasso(alpha=al)
	
    #Simple K-Fold cross validation.
    cv = cross_validation.KFold(len(train), folds)
    #iterate through the training and test cross validation segments and
    #run the classifier on each one, aggregating the results into a list
    results = []
    i = 0
    min_MSE = sys.maxint
    best_train = -1
    best_test = -1
    for traincv, testcv in cv:
        start = timeit.default_timer()
        i += 1
        print i, "epoch"
        rf.fit(train[traincv], target[traincv])
        prediction = rf.predict(train[testcv])
        MSE = mean_squared_error(target[testcv], prediction)
        print "MSE: ", MSE, " for ",i
        if min_MSE > MSE:
            best_train = traincv
            best_test = testcv
            min_MSE = MSE

        results.append(MSE)
        stop = timeit.default_timer()
	print "Program running time: ", stop - start 
    #print out the mean of the cross-validated results
    print "Results: " + str( np.array(results).mean() ), "for folds: ", folds
    print "Results for independent data: ", mean_squared_error(rf.fit(train[best_train], target[best_train]).predict(xtest), ytest)
    print "R squared:"
    print "alpha:", al
Example #28
def lasso_regression(alpha):
    #Fit the model
    lassoreg = Lasso(alpha=alpha,normalize=True, max_iter=1e5)
    lassoreg.fit(A_x, A_y)
    y_pred = lassoreg.predict(A_x)
    
    #Return the result in pre-defined format
    rss = sum((y_pred-A_y)**2)
    ret = [rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret
Example #29
def lasso_regression(data, predictors, alpha):
    #Fit the model
    lassoreg = Lasso(alpha=alpha,normalize=True, max_iter=1e5)
    lassoreg.fit(data[predictors],data['TransformedLife'])
    y_pred = lassoreg.predict(data[predictors])
    
    #Return the result in pre-defined format
    rss = sum((y_pred-data['TransformedLife'])**2)
    ret = [rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret
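A hedged sketch of sweeping the penalty with the helper above; the `data` DataFrame (with a 'TransformedLife' column) and the `predictors` list are assumed to exist as in the original code, and the alpha grid is illustrative:

import numpy as np

alphas = [1e-8, 1e-5, 1e-3, 1e-2, 0.1, 1, 5]
for a in alphas:
    ret = lasso_regression(data, predictors, a)
    rss, coefs = ret[0], np.asarray(ret[2:])
    print(f"alpha={a:g}  RSS={rss:.3f}  nonzero coefficients={np.count_nonzero(coefs)}")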
Example #30
def basispursuit(y, F, penalty=0.1):
    """
    solves basic (vanilla) basis pursuit using scikit-learn
    """

    clf = Lasso(alpha=penalty, fit_intercept=False)
    clf.fit(F, y)
    xhat = clf.coef_

    # reconstruct
    yhat = F.dot(xhat)

    return xhat, yhat
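A minimal sketch of calling the routine on a synthetic sparse-recovery problem (dictionary size, sparsity pattern, and penalty are made up):

import numpy as np

rng = np.random.default_rng(0)
F = rng.normal(size=(100, 400))        # overcomplete dictionary
x_true = np.zeros(400)
x_true[[3, 50, 200]] = [1.0, -2.0, 0.5]
y = F @ x_true

xhat, yhat = basispursuit(y, F, penalty=0.01)
print(np.count_nonzero(np.abs(xhat) > 1e-6))          # number of recovered active atoms
print(np.linalg.norm(y - yhat) / np.linalg.norm(y))   # relative reconstruction error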
Example #31
    def fit(self, K, s):
        r"""Fit the model using the coordinate descent method from scikit-learn.

        Args
        ----

        K: ndarray
            The :math:`m \times n` kernel matrix, :math:`{\bf K}`. A numpy array of
            shape (m, n).
        s: ndarray or CSDM object.
            A csdm object or an equivalent numpy array holding the signal,
            :math:`{\bf s}`, as a :math:`m \times m_\text{count}` matrix.
        """
        s_, self.scale = prepare_signal(s)

        prod = np.asarray(self.f_shape).prod()
        if K.shape[1] != prod:
            raise ValueError(
                "The product of the shape, `f_shape`, must be equal to the length of "
                f"the axis 1 of kernel, K, {K.shape[1]} != {prod}.")

        alpha = s_.size * self.hyperparameters["alpha"]
        Ks, ss = _get_augmented_data(K=K,
                                     s=s_,
                                     alpha=alpha,
                                     regularizer=self.regularizer,
                                     f_shape=self.f_shape)

        # The factor of 0.5 on alpha in the Lasso/LassoLars problem compensates for
        # the 1/(2 * n_samples) factor in the OLS term.
        if self.method == "multi-task":
            estimator = MultiTaskLasso(
                alpha=self.hyperparameters["lambda"] / 2.0,
                fit_intercept=False,
                copy_X=True,
                max_iter=self.max_iterations,
                tol=self.tolerance,
                warm_start=False,
                random_state=None,
                selection="random",
                # positive=self.positive,
            )

        if self.method == "gradient_decent":
            estimator = Lasso(
                alpha=self.hyperparameters["lambda"] / 2.0,
                fit_intercept=False,
                copy_X=True,
                max_iter=self.max_iterations,
                tol=self.tolerance,
                warm_start=False,
                random_state=None,
                selection="random",
                positive=self.positive,
            )

        if self.method == "lars":
            estimator = LassoLars(
                alpha=self.hyperparameters["lambda"] / 2.0,
                fit_intercept=False,
                verbose=True,
                # normalize=False,
                precompute=True,
                max_iter=self.max_iterations,
                eps=2.220446049250313e-16,
                copy_X=True,
                fit_path=False,
                positive=True,
                jitter=None,
                random_state=None,
            )

        estimator.fit(Ks, ss)
        f = estimator.coef_.copy()
        if s_.shape[1] > 1 and len(self.f_shape) == 2:
            f.shape = (s_.shape[1], ) + self.f_shape
            f[:, :, 0] /= 2.0
            f[:, 0, :] /= 2.0
        elif s_.shape[1] == 1 and len(self.f_shape) == 2:
            f.shape = self.f_shape
            f[:, 0] /= 2.0
            f[0, :] /= 2.0

        f *= self.scale
        self.estimator = estimator
        self.f = f
        self.n_iter = estimator.n_iter_
        self._sol_to_csdm(s)
Example #32
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('sense_input', help='the input sense mapping matrix')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('tsns_output',
                        default='tsns.pkl',
                        help='the output target senses pickle file')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed (defaults to 0)')

    recommended_group = parser.add_argument_group(
        'recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument(
        '--unsupervised',
        action='store_true',
        help=
        'recommended if you have no seed dictionary and do not want to rely on identical words'
    )
    recommended_type.add_argument('--future',
                                  action='store_true',
                                  help='experiment with stuff')
    recommended_type.add_argument('--toy',
                                  action='store_true',
                                  help='experiment with stuff on toy dataset')
    recommended_type.add_argument('--acl2018',
                                  action='store_true',
                                  help='reproduce our ACL 2018 system')

    init_group = parser.add_argument_group(
        'advanced initialization arguments',
        'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('--init_unsupervised',
                           action='store_true',
                           help='use unsupervised initialization')
    init_group.add_argument(
        '--unsupervised_vocab',
        type=int,
        default=0,
        help=
        'restrict the vocabulary to the top k entries for unsupervised initialization'
    )

    mapping_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'none'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten',
                               action='store_true',
                               help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction',
                               type=int,
                               default=0,
                               help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')

    self_learning_group = parser.add_argument_group(
        'advanced self-learning arguments',
        'Advanced arguments for self-learning')
    self_learning_group.add_argument(
        '--vocabulary_cutoff',
        type=int,
        default=0,
        help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--stochastic_initial',
        default=0.1,
        type=float,
        help=
        'initial keep probability stochastic dictionary induction (defaults to 0.1)'
    )
    self_learning_group.add_argument(
        '--stochastic_multiplier',
        default=2.0,
        type=float,
        help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument(
        '--stochastic_interval',
        default=50,
        type=int,
        help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument(
        '--log',
        default='map.log',
        help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')

    future_group = parser.add_argument_group('experimental arguments',
                                             'Experimental arguments')
    future_group.add_argument('--skip_top',
                              type=int,
                              default=0,
                              help='Top k words to skip, presumably function')
    future_group.add_argument(
        '--start_src',
        action='store_true',
        help='Algorithm starts by tuning sense embeddings based on source')
    future_group.add_argument('--trim_senses',
                              action='store_true',
                              help='Trim sense table to working vocab')
    future_group.add_argument(
        '--lamb',
        type=float,
        default=0.5,
        help='Weight hyperparameter for sense alignment objectives')
    future_group.add_argument('--reglamb',
                              type=float,
                              default=1.,
                              help='Lasso regularization hyperparameter')
    future_group.add_argument(
        '--ccreglamb',
        type=float,
        default=0.1,
        help='Sense embedding regularization hyperparameter')
    future_group.add_argument('--inv_delta',
                              type=float,
                              default=0.0001,
                              help='Delta_I added for inverting sense matrix')
    future_group.add_argument('--lasso_iters',
                              type=int,
                              default=10,
                              help='Number of iterations for LASSO/NMF')
    future_group.add_argument('--iterations',
                              type=int,
                              default=-1,
                              help='Number of overall model iterations')
    future_group.add_argument('--trg_batch',
                              type=int,
                              default=5000,
                              help='Batch size for target steps')
    future_group.add_argument(
        '--trg_knn',
        action='store_true',
        help='Perform target sense mapping by k-nearest neighbors')
    future_group.add_argument(
        '--trg_sns_csls',
        type=int,
        default=10,
        help='K-nearest neighbors for CSLS target sense search')
    future_group.add_argument(
        '--senses_per_trg',
        type=int,
        default=1,
        help='K-max target sense mapping (default = 1 = off)')
    future_group.add_argument(
        '--gd',
        action='store_true',
        help='Apply gradient descent for assignment and synset embeddings')
    future_group.add_argument('--gd_lr',
                              type=float,
                              default=1e-2,
                              help='Learning rate for SGD (default=0.01)')
    future_group.add_argument('--gd_wd',
                              action='store_true',
                              help='Weight decay in SGD')
    future_group.add_argument(
        '--gd_wd_hl',
        type=int,
        default=100,
        help='Weight decay half-life in SGD, default=100')
    future_group.add_argument(
        '--gd_clip',
        type=float,
        default=5.,
        help='Per-coordinate gradient clipping (default=5)')
    future_group.add_argument(
        '--gd_map_steps',
        type=int,
        default=1,
        help='Consecutive steps for each target-sense mapping update phase')
    future_group.add_argument(
        '--gd_emb_steps',
        type=int,
        default=1,
        help='Consecutive steps for each sense embedding update phase')
    future_group.add_argument(
        '--base_prox_lambda',
        type=float,
        default=0.99,
        help='Lambda for proximal gradient in lasso step')
    future_group.add_argument(
        '--prox_decay',
        action='store_true',
        help='Multiply proximal lambda by itself each iteration')
    future_group.add_argument(
        '--sense_limit',
        type=float,
        default=1.1,
        help=
        'Maximum amount of target sense mappings, in terms of source mappings (default=1.1x)'
    )
    future_group.add_argument(
        '--gold_pairs',
        help='Gold data for evaluation, if exists (not for tuning)')
    future_group.add_argument(
        '--gold_threshold',
        type=float,
        default=0.0,
        help='Threshold for gold mapping (0 is fine if sparse)')

    future_group.add_argument('--debug', action='store_true')

    args = parser.parse_args()

    # pre-setting groups
    if args.toy:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=50,
                            trim_senses=True,
                            inv_delta=1.,
                            reglamb=0.2,
                            lasso_iters=100,
                            gd_wd=True,
                            log='map-toy.log')
    if args.unsupervised or args.future:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=2000,
                            trim_senses=True,
                            gd_wd=True)
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=20000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None
            or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'  # many operations not supported by cupy
    elif args.precision == 'fp32':  # default
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    print('reading embeddings...')
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)
    print('embeddings read')

    # Read input source sense mapping
    print('reading sense mapping')
    src_senses = pickle.load(open(args.sense_input, 'rb'))
    if src_senses.shape[0] != x.shape[0]:
        src_senses = csr_matrix(src_senses.transpose()
                                )  # using non-cuda scipy because of 'inv' impl
    #src_senses = get_sparse_module(src_senses)
    print(
        f'source sense mapping of shape {src_senses.shape} loaded with {src_senses.getnnz()} nonzeros'
    )

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
        print('CUDA loaded')
    else:
        xp = np
    xp.random.seed(args.seed)

    # removed word to index map (only relevant in supervised learning or with validation)

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)
    print('normalization complete')

    # removed building the seed dictionary

    # removed validation step

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
        print(f'logging into {args.log}')

    # Allocate memory

    # Initialize the projection matrices W(s) = W(t) = I.
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    xw[:] = x
    zw[:] = z

    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(
        x.shape[0] - args.skip_top, args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(
        z.shape[0] - args.skip_top, args.vocabulary_cutoff)
    emb_dim = x.shape[1]

    cutoff_end = min(src_size + args.skip_top, x.shape[0])

    if args.trim_senses:
        # reshape sense assignment
        src_senses = src_senses[args.skip_top:cutoff_end]

        # new columns for words with no senses in original input
        ### TODO might also need this if not trimming (probably kinda far away)
        newcols = [csc_matrix(([1],([i],[0])),shape=(src_size,1)) for i in range(src_size)\
                   if src_senses.getrow(i).getnnz() == 0]
        #with open(f'data/synsets/dummy_synsets_v3b_{src_size}','wb') as dummy_cols_file:
        #    dummy_col_idcs = [i for i in range(src_size) if src_senses.getrow(i).getnnz() == 0]
        #    pickle.dump(np.array(dummy_col_idcs), dummy_cols_file)

        # trim senses no longer used, add new ones
        colsums = src_senses.sum(axis=0).tolist()[0]
        kept_senses = [i for i, j in enumerate(colsums) if j > 0]
        #with open(f'data/synsets/kept_synsets_v3b_{src_size}','wb') as kept_save_file:
        #    pickle.dump(np.array(kept_senses), kept_save_file)
        src_senses = hstack([src_senses[:, kept_senses]] + newcols)
        print(
            f'trimmed sense dictionary dimensions: {src_senses.shape} with {src_senses.getnnz()} nonzeros'
        )
    sense_size = src_senses.shape[1]

    if args.gold_pairs is not None:
        with open(args.gold_pairs, 'rb') as gold_pairs_f:
            gold_pairs = pickle.load(gold_pairs_f)
            gold_pairs = [(i-args.skip_top,j) for i,j in gold_pairs \
                          if i >= args.skip_top and i < src_senses.shape[0] and j < src_senses.shape[1]]
        gold_trgs = sorted(set([x[0] for x in gold_pairs]))
        gold_senses = sorted(set([x[1] for x in gold_pairs]))
        gold_domain_size = len(gold_trgs) * len(gold_senses)
        print(
            f'evaluating on {len(gold_pairs)} pairs with {len(gold_trgs)} unique words and {len(gold_senses)} unique senses'
        )

    # Initialize the concept embeddings from the source embeddings
    ### TODO maybe try gradient descent instead?
    ### TODO (pre-)create non-singular alignment matrix
    cc = xp.empty((sense_size, emb_dim), dtype=dtype)  # \tilde{E}
    t01 = time.time()
    print('starting psinv calc')
    src_sns_psinv = psinv(src_senses, dtype, args.inv_delta)
    xecc = x[args.skip_top:cutoff_end].T.dot(
        get_sparse_module(src_senses).toarray()).T  # sense_size * emb_dim
    cc[:] = src_sns_psinv.dot(xecc)
    print(f'initialized concept embeddings in {time.time()-t01:.2f} seconds',
          file=sys.stderr)
    if args.verbose:
        # report precision of the pseudo-inverse operation, checked by inverting
        pseudo_id = src_senses.transpose().dot(src_senses).dot(
            src_sns_psinv.get())
        real_id = sparse_id(sense_size)
        rel_diff = (pseudo_id - real_id).sum() / (sense_size * sense_size)
        print(f'per-coordinate pseudo-inverse precision is {rel_diff:.5f}')

    ### TODO initialize trg_senses using seed dictionary instead?
    trg_sns_size = trg_size if args.trim_senses else z.shape[0]
    trg_senses = csr_matrix(
        (trg_sns_size,
         sense_size))  # using non-cuda scipy because of 'inv' impl
    zecc = xp.empty_like(xecc)  # sense_size * emb_dim
    #tg_grad = xp.empty((trg_sns_size, sense_size))

    if args.gd:
        # everything can be done on gpu
        src_senses = get_sparse_module(src_senses, dtype=dtype)
        trg_senses = get_sparse_module(trg_senses, dtype=dtype)
        if args.sense_limit > 0.0:
            trg_sense_limit = int(args.sense_limit * src_senses.getnnz())
            if args.verbose:
                print(
                    f'limiting target side to {trg_sense_limit} sense mappings'
                )
        else:
            trg_sense_limit = -1

    ### TODO return memory assignment for similarities?

    # Training loop
    if args.gd:
        prox_lambda = args.base_prox_lambda
    else:
        lasso_model = Lasso(alpha=args.reglamb, fit_intercept=False, max_iter=args.lasso_iters,\
                            positive=True, warm_start=True)  # TODO more parametrization

    if args.log is not None:
        if args.gd:
            print(f'gradient descent lr: {args.gd_lr}', file=log)
            print(f'base proximal lambda: {args.base_prox_lambda}', file=log)
        else:
            print(f'lasso regularization: {args.reglamb}', file=log)
            print(f'lasso iterations: {args.lasso_iters}', file=log)
            print(f'inversion epsilon: {args.inv_delta}', file=log)
        if args.gold_pairs is not None:
            print(f'gold mappings: {len(gold_pairs)}', file=log)
        print(
            f'Iteration\tObjective\tSource\tTarget\tL_1\tDuration\tNonzeros\tCorrect_mappings',
            file=log)
        log.flush()

    best_objective = objective = 1000000000.
    correct_mappings = -1
    regularization_lambda = args.base_prox_lambda if args.gd else args.reglamb
    it = 1
    last_improvement = 0
    t = time.time()
    map_gd_lr = args.gd_lr
    emb_gd_lr = args.gd_lr
    end = False
    print('starting training')

    if args.start_src:
        print('starting with converging synset embeddings')
        it_range = range(
            args.iterations
        )  ### TODO possibly add arg, but there's early stopping
        if not args.verbose:
            it_range = tqdm(it_range)
        prev_obj = float('inf')
        for pre_it in it_range:
            if args.gd_wd:
                emb_gd_lr = args.gd_lr * pow(0.5, floor(
                    pre_it / args.gd_wd_hl))

            # Synset embedding
            cc_grad = src_senses.T.dot(
                xw[args.skip_top:cutoff_end] -
                src_senses.dot(cc)) - args.ccreglamb * cc
            cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad)
            cc += emb_gd_lr * cc_grad

            # Source projection
            u, s, vt = xp.linalg.svd(cc.T.dot(xecc))
            wx = vt.T.dot(u.T).astype(dtype)
            x.dot(wx, out=xw)

            pre_objective = ((xp.linalg.norm(
                xw[args.skip_top:cutoff_end] -
                get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2
            pre_objective = float(pre_objective)

            if args.verbose and pre_it > 0 and pre_it % 10 == 0:
                print(
                    f'source synset embedding objective iteration {pre_it}: {pre_objective:.3f}'
                )

            if pre_objective > prev_obj:
                print(
                    f'stopping at pre-iteration {pre_it}, source-sense objective {prev_obj:.3f}'
                )
                # revert
                cc -= emb_gd_lr * cc_grad
                break

            prev_obj = pre_objective

    while True:
        if it % 50 == 0:
            print(
                f'starting iteration {it}, last objective was {objective}, correct mappings at {correct_mappings}'
            )

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            last_improvement = it

        if args.iterations > 0 and it > args.iterations:
            end = True

        ### update target assignments (6) - lasso-esque regression
        time6 = time.time()
        # optimize: 0.5 * (xp.linalg.norm(zw[i] - trg_senses[i].dot(cc))^2) + (regularization_lambda * xp.linalg.norm(trg_senses[i],1))

        if args.trg_knn:
            # for csls-based neighborhoods
            knn_sense = xp.full(sense_size, -100)
            for i in range(0, sense_size, args.trg_batch):
                batch_end = min(i + args.trg_batch, sense_size)
                sim_sense_trg = cc[i:batch_end].dot(
                    zw[args.skip_top:cutoff_end].T)
                knn_sense[i:batch_end] = topk_mean(sim_sense_trg,
                                                   k=args.trg_sns_csls,
                                                   inplace=True)

            # calculate new target mappings
            trg_senses = lil_matrix(trg_senses.shape)
            for i in range(0, trg_size, args.trg_batch):
                sns_batch_end = min(i + args.trg_batch, trg_size)
                z_i = i + args.skip_top
                z_batch_end = min(sns_batch_end + args.skip_top, zw.shape[0])

                sims = zw[z_i:z_batch_end].dot(cc.T)
                sims -= knn_sense / 2  # equivalent to the real CSLS scores for NN
                best_idcs = sims.argmax(1).tolist()
                trg_senses[(list(range(i, sns_batch_end)),
                            best_idcs)] = sims.max(1).tolist()

                # second-to-lth-best
                for l in range(args.senses_per_trg - 1):
                    sims[(list(range(sims.shape[0])), best_idcs)] = 0.
                    best_idcs = sims.argmax(1).tolist()
                    trg_senses[(list(range(i, sns_batch_end)),
                                best_idcs)] = sims.max(1).tolist()

            trg_senses = get_sparse_module(trg_senses.tocsr())

        elif args.gd:
            ### TODO add args.skip_top calculations
            if args.gd_wd:
                true_it = (it - 1) * args.gd_map_steps
                map_gd_lr = args.gd_lr * pow(
                    0.5, floor((1 + true_it) / args.gd_wd_hl))
                if args.verbose:
                    print(f'mapping learning rate: {map_gd_lr}')

            for k in range(args.gd_map_steps):
                # st <- st + eta * (ew - st.dot(es)).dot(es.T)
                # allow up to sense_limit updates, clip gradient

                batch_grads = []
                for i in range(0, trg_size, args.trg_batch):
                    batch_end = min(i + args.trg_batch, trg_size)
                    tg_grad_b = (zw[i:batch_end] -
                                 trg_senses[i:batch_end].dot(cc)).dot(cc.T)

                    # proximal gradient
                    tg_grad_b += prox_lambda
                    tg_grad_b.clip(None, 0.0, out=tg_grad_b)
                    batch_grads.append(batch_sparse(tg_grad_b))

                tg_grad = get_sparse_module(vstack(batch_grads))
                del tg_grad_b

                if args.prox_decay:
                    prox_lambda *= args.base_prox_lambda

                ### TODO consider weight decay here as well (args.gd_wd)
                trg_senses -= map_gd_lr * tg_grad

                # allow up to sense_limit nonzeros
                if trg_sense_limit > 0:
                    trg_senses = trim_sparse(trg_senses,
                                             trg_sense_limit,
                                             clip=None)

            ### TODO consider finishing up with lasso (maybe only in final iteration)

        else:
            ### TODO add args.skip_top calculations
            # parallel LASSO (no cuda impl)
            cccpu = cc.get().T  # emb_dim * sense_size
            lasso_model.fit(cccpu, zw[:trg_size].get().T)
            ### TODO maybe trim, keep only above some threshold (0.05) OR top f(#it)
            trg_senses = lasso_model.sparse_coef_

        if args.verbose:
            print(
                f'target sense mapping step: {(time.time()-time6):.2f} seconds, {trg_senses.getnnz()} nonzeros',
                file=sys.stderr)
            objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro') ** 2)\
                            + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \
                        + regularization_lambda * trg_senses.sum()  # TODO consider thresholding reg part
            objective = float(objective)
            print(f'objective: {objective:.3f}')

        # Write target sense mapping
        with open(f'tmp_outs/{args.tsns_output[:-4]}-it{it:03d}.pkl',
                  mode='wb') as tsnsfile:
            pickle.dump(trg_senses.get(), tsnsfile)

        ### update synset embeddings (10)
        time10 = time.time()
        if args.gd and args.gd_emb_steps > 0:
            ### TODO probably handle sizes and/or threshold sparse matrix
            if args.gd_wd:
                true_it = (it - 1) * args.gd_emb_steps
                emb_gd_lr = args.gd_lr * pow(
                    0.5, floor((1 + true_it) / args.gd_wd_hl))
                if args.verbose:
                    print(f'embedding learning rate: {emb_gd_lr}')

            ### replace block for no-source-tuning mode
            all_senses = trg_senses if args.start_src else get_sparse_module(
                vstack((src_senses.get(), trg_senses.get()), format='csr'),
                dtype=dtype)
            aw = zw[args.
                    skip_top:cutoff_end] if args.start_src else xp.concatenate(
                        (xw[args.skip_top:cutoff_end],
                         zw[args.skip_top:cutoff_end]))

            for i in range(args.gd_emb_steps):
                cc_grad = all_senses.T.dot(
                    aw - all_senses.dot(cc)) - args.ccreglamb * cc
                cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad)
                cc += emb_gd_lr * cc_grad

        else:
            ### TODO add args.skip_top calculations
            all_senses = get_sparse_module(
                vstack((src_senses, trg_senses), format='csr'))
            xzecc = xp.concatenate((xw[:src_size], zw[:trg_size])).T\
                        .dot(all_senses.toarray()).T  # sense_size * emb_dim
            all_sns_psinv = psinv(
                all_senses.get(), dtype, args.inv_delta
            )  ### TODO only update target side? We still have src_sns_psinv [it doesn't matter, dimensions are the same]
            cc[:] = all_sns_psinv.dot(xzecc)

        if args.verbose:
            print(f'synset embedding update: {time.time()-time10:.2f}',
                  file=sys.stderr)
            objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro')) ** 2\
                            + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \
                        + regularization_lambda * trg_senses.sum()  # TODO consider thresholding reg part
            objective = float(objective)
            print(f'objective: {objective:.3f}')

        ### update projections (3,5)
        # write to zw and xw
        if args.orthogonal or not end:

            ### remove block for no-source-tuning mode
            # source side - mappings don't change so xecc is constant
            #if not args.start_src:  # need to do this anyway whenever cc updates
            time3 = time.time()
            u, s, vt = xp.linalg.svd(cc.T.dot(xecc))
            wx = vt.T.dot(u.T).astype(dtype)
            x.dot(wx, out=xw)
            if args.verbose:
                print(f'source projection update: {time.time()-time3:.2f}',
                      file=sys.stderr)

            # target side - compute sense mapping first
            time3 = time.time()
            zecc.fill(0.)
            for i in range(0, trg_size, args.trg_batch):
                end_idx = min(i + args.trg_batch, trg_size)
                zecc += z[i:end_idx].T.dot(
                    get_sparse_module(trg_senses[i:end_idx]).toarray()).T
            u, s, vt = xp.linalg.svd(cc.T.dot(zecc))
            wz = vt.T.dot(u.T).astype(dtype)
            z.dot(wz, out=zw)
            if args.verbose:
                print(f'target projection update: {time.time()-time3:.2f}',
                      file=sys.stderr)

        ### TODO add parts from 'advanced mapping' part - transformations, whitening, etc.

        # Objective function evaluation
        time_obj = time.time()
        trg_senses_l1 = float(trg_senses.sum())
        src_obj = (float(
            xp.linalg.norm(
                xw[args.skip_top:cutoff_end] -
                get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2
        trg_obj = (float(
            xp.linalg.norm(
                zw[args.skip_top:cutoff_end] -
                get_sparse_module(trg_senses).dot(cc), 'fro'))**2) / 2
        objective = src_obj + trg_obj + regularization_lambda * trg_senses_l1  # TODO consider thresholding reg part
        if args.verbose:
            print(f'objective calculation: {time.time()-time_obj:.2f}',
                  file=sys.stderr)

        if objective - best_objective <= -args.threshold:
            last_improvement = it
            best_objective = objective

        # WordNet transduction evaluation (can't tune on this)
        if args.gold_pairs is not None:
            np_trg_senses = trg_senses.get()
            trg_corr = [
                p for p in gold_pairs if np_trg_senses[p] > args.gold_threshold
            ]
            correct_mappings = len(trg_corr)
            domain_trgs = np_trg_senses[gold_trgs][:, gold_senses]
        else:
            correct_mappings = -1

        # Logging
        duration = time.time() - t
        if args.verbose:
            print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                  file=sys.stderr)
            print('objective: {0:.3f}'.format(objective), file=sys.stderr)
            print('target senses l_1 norm: {0:.3f}'.format(trg_senses_l1),
                  file=sys.stderr)
            if len(gold_pairs) > 0 and domain_trgs.getnnz() > 0:
                print(
                    f'{correct_mappings} correct target mappings: {(correct_mappings/len(gold_pairs)):.3f} recall, {(correct_mappings/domain_trgs.getnnz()):.3f} precision',
                    file=sys.stderr)
            print(file=sys.stderr)
            sys.stderr.flush()
        if args.log is not None:
            print(
                f'{it}\t{objective:.3f}\t{src_obj:.3f}\t{trg_obj:.3f}\t{trg_senses_l1:.3f}\t{duration:.3f}\t{trg_senses.getnnz()}\t{correct_mappings}',
                file=log)
            log.flush()

        if end:
            break

        t = time.time()
        it += 1

    # Write mapped embeddings
    with open(args.src_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as srcfile:
        embeddings.write(src_words, xw, srcfile)
    with open(args.trg_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as trgfile:
        embeddings.write(trg_words, zw, trgfile)

    # Write target sense mapping
    with open(args.tsns_output, mode='wb') as tsnsfile:
        pickle.dump(trg_senses.get(), tsnsfile)
Пример #33
0
            ["level", "temperature", "usage", "Brightness", "RAM"])
        df_label_Num = pandas.DataFrame(polyData_Num, columns=columnNames)

        for column in columnNames:
            df_label[column] = pandas.Series(df_label_Num[column])

        # Get dataframes
        y_label = df_label["output"]
        X_label = df_label.drop(["output"], axis=1)

        # Split data training and testing ...
        X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(
            X_label, y_label, test_size=0.25, random_state=42)

        # Create the model
        regressor = Lasso()

        # find optimal alpha with grid search
        alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        param_grid = {'alpha': alpha}
        scoring = ['neg_mean_absolute_error', 'neg_root_mean_squared_error']
        grid = GridSearchCV(estimator=regressor,
                            param_grid=param_grid,
                            scoring=scoring,
                            refit=scoring[0],
                            return_train_score=True,
                            cv=3)
        grid_result = grid.fit(X_train_label, y_train_label)

        print(
            f"Best Score: {abs(grid_result.best_score_)} - Best Params: {grid_result.best_params_} for label {label} ({df_label.shape})")
y_train = train.SalePrice.values
train = pd.DataFrame(all_data[:ntrain])
test = pd.DataFrame(all_data[ntrain:])

from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
#1
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
#2
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
#3
ENet = make_pipeline(RobustScaler(),
                     ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
#4
GBoost = GradientBoostingRegressor(n_estimators=3000,
                                   learning_rate=0.05,
                                   max_depth=4,
                                   max_features='sqrt',
                                   min_samples_leaf=15,
                                   min_samples_split=10,
                                   loss='huber',
                                   random_state=5)
#5
Пример #35
0
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]

lasso = Lasso(random_state=0)
alphas = np.logspace(-4, -0.5, 30)

tuned_parameters = [{'alpha': alphas}]
n_folds = 3

clf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)
clf.fit(X, y)
scores = clf.cv_results_['mean_test_score']
scores_std = clf.cv_results_['std_test_score']
plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores)

# plot error lines showing +/- std. errors of the scores
std_error = scores_std / np.sqrt(n_folds)
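# The example stops right after computing std_error; a plausible continuation
# (a sketch, not guaranteed to match the original script) would draw the
# +/- one standard error band around the mean CV score and mark the best value:
plt.semilogx(alphas, scores + std_error, 'b--')
plt.semilogx(alphas, scores - std_error, 'b--')
plt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)
plt.ylabel('CV score +/- std error')
plt.xlabel('alpha')
plt.axhline(np.max(scores), linestyle='--', color='.5')  # best mean score
plt.xlim([alphas[0], alphas[-1]])
plt.show()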
Пример #36
0
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import MultiTaskLasso, Lasso

rng = np.random.RandomState(42)

# Generate some 2D coefficients with sine waves with random frequency and phase
n_samples, n_features, n_tasks = 100, 30, 40
n_relevant_features = 5
coef = np.zeros((n_tasks, n_features))
times = np.linspace(0, 2 * np.pi, n_tasks)
for k in range(n_relevant_features):
    coef[:, k] = np.sin((1. + rng.randn(1)) * times + 3 * rng.randn(1))

X = rng.randn(n_samples, n_features)
Y = np.dot(X, coef.T) + rng.randn(n_samples, n_tasks)

coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T])
coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_

###############################################################################
# Plot support and time series
fig = plt.figure(figsize=(8, 5))
plt.subplot(1, 2, 1)
plt.spy(coef_lasso_)
plt.xlabel('Feature')
plt.ylabel('Time (or Task)')
plt.text(10, 5, 'Lasso')
plt.subplot(1, 2, 2)
plt.spy(coef_multi_task_lasso_)
plt.xlabel('Feature')
plt.ylabel('Time (or Task)')
plt.text(10, 5, 'MultiTaskLasso')
Пример #37
0
    def set_objective(self, X, y, lmbd):
        self.X, self.y, self.lmbd = X, y, lmbd

        n_samples = self.X.shape[0]
        self.clf = Lasso(alpha=self.lmbd/n_samples, fit_intercept=False, tol=0)
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
Пример #38
0
def cv_predict_fixture(generate_data_cv_predict, cross_fit, params):
    n_folds = 4
    # collect data
    (x, y, classifier) = generate_data_cv_predict

    if classifier:
        method = 'predict_proba'
    else:
        method = 'predict'

    if cross_fit:
        smpls = [
            (train, test)
            for train, test in KFold(n_splits=n_folds, shuffle=True).split(x)
        ]
    else:
        n_obs = len(y)
        smpls = train_test_split(np.arange(n_obs), test_size=0.23)
        smpls = [[np.sort(x)
                  for x in smpls]]  # only sorted indices are supported

    if params is None:
        est_params = None
    elif params == 'global':
        if method == 'predict_proba':
            est_params = {'C': 0.5}
        else:
            est_params = {'alpha': 0.5}
    else:
        assert params == 'per_fold'
        if method == 'predict_proba':
            if cross_fit:
                est_params = [{
                    'C': np.random.uniform()
                } for i in range(n_folds)]
            else:
                est_params = {'C': 1.}
        else:
            if cross_fit:
                est_params = [{
                    'alpha': np.random.uniform()
                } for i in range(n_folds)]
            else:
                est_params = {'alpha': 1.}

    if method == 'predict_proba':
        preds = _dml_cv_predict(LogisticRegression(),
                                x,
                                y,
                                smpls,
                                est_params=est_params,
                                method=method)
        preds_ut = _dml_cv_predict_ut_version(LogisticRegression(),
                                              x,
                                              y,
                                              smpls,
                                              est_params=est_params,
                                              method=method)[:, 1]
    else:
        preds = _dml_cv_predict(Lasso(),
                                x,
                                y,
                                smpls,
                                est_params=est_params,
                                method=method)
        preds_ut = _dml_cv_predict_ut_version(Lasso(),
                                              x,
                                              y,
                                              smpls,
                                              est_params=est_params,
                                              method=method)

    res_dict = {'preds': preds, 'preds_ut': preds_ut}

    return res_dict
Пример #39
0
# TODO: Add import statements
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso

# Assign the data to predictor and outcome variables
# TODO: Load the data
train_data = pd.read_csv('data_lasso.csv', header=None)
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]

# TODO: Create the linear regression model with lasso regularization.
lasso_reg = Lasso()

# TODO: Fit the model.
lasso_reg.fit(X, y)

# TODO: Retrieve and print out the coefficients from the regression model.
reg_coef = lasso_reg.coef_
print(reg_coef)
Пример #40
0
max_depths = list(range(2, 10 + 1)) + [None]
d_max_depths = {'max_depth': max_depths}
d_max_depths_base = {'base_estimator__max_depth': max_depths}
Ks = {'n_neighbors': [1, 2, 3, 5, 10, 15, 25, 50, 100, 200]}

OUTCOME_MODEL_GRID = [
    ('LinearRegression', LinearRegression(), {}),
    ('LinearRegression_interact',
     make_pipeline(PolynomialFeatures(degree=2, interaction_only=True),
                   LinearRegression()), {}),
    ('LinearRegression_degree2',
     make_pipeline(PolynomialFeatures(degree=2), LinearRegression()), {}),
    # ('LinearRegression_degree3',
    #  make_pipeline(PolynomialFeatures(degree=3), LinearRegression()), {}),
    ('Ridge', Ridge(), alphas),
    ('Lasso', Lasso(), alphas),
    ('ElasticNet', ElasticNet(), alphas),
    ('KernelRidge', KernelRidge(), alphas),
    ('SVM_rbf', SVR(kernel='rbf'), d_Cs),
    ('SVM_sigmoid', SVR(kernel='sigmoid'), d_Cs),
    ('LinearSVM', LinearSVR(), d_Cs),
    # (SVR(kernel='linear'), d_Cs), # doesn't seem to work (runs forever)

    # TODO: add tuning of SVM gamma, rather than using the default "scale" setting
    # SVMs are sensitive to input scale
    ('Standardized_SVM_rbf',
     Pipeline([('standard', StandardScaler()),
               (SVM, SVR(kernel='rbf'))]), d_Cs_pipeline),
    ('Standardized_SVM_sigmoid',
     Pipeline([('standard', StandardScaler()),
               (SVM, SVR(kernel='sigmoid'))]), d_Cs_pipeline),
Пример #41
0
def regularize_by_l1(X_train,
                     X_test,
                     y_train,
                     y_test,
                     all_features,
                     N_k,
                     task,
                     N_repeat,
                     seed_no=0):
    ## 0. Input arguments:
    # X_train: array that contains training feature data
    # X_test: array that contains testing feature data
    # y_train: array that contains training response data
    # y_test: array that contains testing response data
    # all_features: names of all features (column names of X_train)
    # N_k: number of folds to split into
    # task: type of supervised learning task: 'regression' or 'classification'
    # N_repeat: number of independent cross-validation runs, each run will generate one performance score
    # seed_no: seed number to be used in the first run, 'seed_no + 1' will be used for the second run, ...

    ## 1. Perform regularized classification/regression based on the specified task
    # regression
    if task == 'regression':
        # split data into K folds
        kf = KFold(n_splits=N_k, random_state=seed_no, shuffle=True)
        # find the optimal alpha (regularization factor) using K-fold cross validation on training data
        cv_regressor = LassoCV(cv=kf, random_state=seed_no)
        cv_regressor.fit(X_train, y_train)
        best_alpha = cv_regressor.alpha_
        # fit lasso regression using the optimal alpha
        final_learner = Lasso(alpha=best_alpha)
        final_learner.fit(X_train, y_train)
        # obtain selected features from the fitted lasso regression model (features with nonzero coefficients)
        select_features = all_features[(final_learner.coef_ != 0).flatten()]
        N_select = len(select_features)
        # perform K-fold cross validation to obtain the training performance of fitted lasso regression model
        train_metric = []
        for i in range(0, N_repeat):
            cv_kf = KFold(n_splits=N_k, random_state=i + 1, shuffle=True)
            r2 = cross_val_score(final_learner,
                                 X_train,
                                 y_train,
                                 cv=cv_kf,
                                 scoring='r2')
            mse = cross_val_score(final_learner,
                                  X_train,
                                  y_train,
                                  cv=cv_kf,
                                  scoring='neg_mean_squared_error')
            train_metric.append({'r2': np.mean(r2), 'mse': np.mean(mse)})
        train_metric_df = pd.DataFrame(train_metric)
        # implement fitted lasso regression model on the testing set and obtain the testing performance
        y_pred = final_learner.predict(X_test)
        test_r2 = r2_score(y_test, y_pred)
        test_mse = mean_squared_error(y_test, y_pred)
        test_metric = {'r2': test_r2, 'test_mse': test_mse}

    # classification
    if task == 'classification':
        # stratified split for classification tasks
        kf = StratifiedKFold(n_splits=N_k, random_state=seed_no, shuffle=True)
        # find the optimal C (regularization factor) using K-fold cross validation on training data
        cv_classifier = LogisticRegressionCV(penalty='l1',
                                             solver='liblinear',
                                             cv=kf,
                                             random_state=seed_no)
        cv_classifier.fit(X_train, y_train)
        best_c = float(cv_classifier.C_)
        # fit logistic regression using the optimal C
        final_learner = LogisticRegression(penalty='l1',
                                           solver='liblinear',
                                           C=best_c,
                                           random_state=seed_no)
        final_learner.fit(X_train, y_train)
        # obtain selected features from the fitted logistic regression model (features with nonzero coefficients)
        select_features = all_features[(final_learner.coef_ != 0).flatten()]
        N_select = len(select_features)
        # perform K-fold cross validation to obtain the training performance of fitted logistic regression model
        train_metric = []
        for i in range(0, N_repeat):
            cv_kf = StratifiedKFold(n_splits=N_k,
                                    random_state=i + 1,
                                    shuffle=True)
            auc = cross_val_score(final_learner,
                                  X_train,
                                  y_train,
                                  cv=cv_kf,
                                  scoring='roc_auc')
            bac = cross_val_score(final_learner,
                                  X_train,
                                  y_train,
                                  cv=cv_kf,
                                  scoring='balanced_accuracy')
            f1 = cross_val_score(final_learner,
                                 X_train,
                                 y_train,
                                 cv=cv_kf,
                                 scoring='f1')
            train_metric.append({
                'auc': np.mean(auc),
                'bac': np.mean(bac),
                'f1': np.mean(f1)
            })
        train_metric_df = pd.DataFrame(train_metric)
        # compare with testing response data, compute metrics
        y_pred_prob = final_learner.predict_proba(X_test)[:, 1]
        y_pred = final_learner.predict(X_test)
        test_auc = roc_auc_score(y_test, y_pred_prob)
        test_bac = balanced_accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)
        test_metric = {'auc': test_auc, 'bac': test_bac, 'f1': test_f1}

    return final_learner, select_features, train_metric_df, test_metric
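# Hedged usage sketch (not from the original source): the synthetic data and
# feature names below are placeholders, used only to illustrate the call
# signature of regularize_by_l1 for a regression task.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 20)
y_demo = 2.0 * X_demo[:, 0] + rng.randn(200)
X_tr, X_te = X_demo[:150], X_demo[150:]
y_tr, y_te = y_demo[:150], y_demo[150:]
feature_names = pd.Index([f'f{i}' for i in range(20)])

model, selected, train_perf, test_perf = regularize_by_l1(
    X_tr, X_te, y_tr, y_te, feature_names,
    N_k=5, task='regression', N_repeat=3, seed_no=0)
print(selected, test_perf)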
Пример #42
0
degree = 10
# iter_array=[1000,5000,10000,50000,100000,500000]
# train_err = []
# test_err = []
print("Number of Training points are ", train_x.shape[0])
print("Number of Testing points are ", test_x.shape[0])
print("")
poly = PolynomialFeatures(degree=degree, include_bias=False)
modified_train_x = poly.fit_transform(train_x)
modified_test_x = poly.transform(test_x)
# for max_iter in iter_array:
print("Lasso with default alpha")
print("\n")
print("\n")
reg = Lasso()
reg.fit(modified_train_x, train_y)
print("Lasso Train RMSE is: ",
      math.sqrt(mean_squared_error(train_y, reg.predict(modified_train_x))))
print("Lasso Test RMSE is: ",
      math.sqrt(mean_squared_error(test_y, reg.predict(modified_test_x))))
# train_err.append(math.sqrt(mean_squared_error(train_y,reg.predict(modified_train_x))))
# test_err.append(math.sqrt(mean_squared_error(test_y,reg.predict(modified_test_x))))
# print(reg.coef_)
# plt.xlabel('Iterations')
# plt.ylabel('RMSE')
# plt.plot(iter_array , train_err , 'bo-', label='Training')
# plt.plot(iter_array , test_err , 'ro-' , label='Test')
# plt.legend()
print("\n")
print("Polynomial Regression with degree 10")
Пример #43
0
import pandas as pd
from sklearn.linear_model import Lasso
import pickle

dataset = pd.read_csv('Jan_2019.csv')
X = dataset.iloc[0:31, 2:6]
y = dataset.iloc[0:31, 6:10]
lassoreg = Lasso(alpha=0.1)
lassoreg.fit(X, y)
pickle.dump(lassoreg, open('model_jan.pkl', 'wb'))

dataset = pd.read_csv('Feb_2019.csv')
X = dataset.iloc[0:28, 2:6]
y = dataset.iloc[0:28, 6:10]
lassoreg = Lasso(alpha=0.1)
lassoreg.fit(X, y)
pickle.dump(lassoreg, open('model_feb.pkl', 'wb'))

dataset = pd.read_csv('Mar_2019.csv')
X = dataset.iloc[0:31, 2:6]
y = dataset.iloc[0:31, 6:10]
lassoreg = Lasso(alpha=0.1)
lassoreg.fit(X, y)
pickle.dump(lassoreg, open('model_mar.pkl', 'wb'))

dataset = pd.read_csv('Apr_2019.csv')
X = dataset.iloc[0:30, 2:6]
y = dataset.iloc[0:30, 6:10]
lassoreg = Lasso(alpha=0.1)
lassoreg.fit(X, y)
pickle.dump(lassoreg, open('model_apr.pkl', 'wb'))
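# The four blocks above differ only in the CSV file name and the number of day
# rows; an equivalent loop (a sketch assuming the same column layout) could be:
for month, days in [('jan', 31), ('feb', 28), ('mar', 31), ('apr', 30)]:
    dataset = pd.read_csv(f'{month.capitalize()}_2019.csv')
    X = dataset.iloc[0:days, 2:6]
    y = dataset.iloc[0:days, 6:10]
    lassoreg = Lasso(alpha=0.1)
    lassoreg.fit(X, y)
    pickle.dump(lassoreg, open(f'model_{month}.pkl', 'wb'))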
Пример #44
0
 def __init__(self):
     self.model = Lasso()
Пример #45
0
    def _validate_estimator_params(self, estimator, kwargs):
        """Validate estimator and parameters inputs

        Parameters
        ----------
        estimator: str
            Estimator name
        kwargs: keyword arguments
            Grid search named parameters

        Returns
        -------
        estimator: sklearn.Estimator
            sklearn estimator, implementing `fit` and `predict`
        params: dict
            Grid search params
        
        TODO: think about default ranges for grid search
        """
        if not isinstance(estimator, str):
            raise TypeError('estimator argument must be str, but received %s' %
                            type(estimator))
        _estimator = estimator.lower()
        if _estimator == 'svr':
            _kernel = kwargs.get('kernel', 'rbf')
            _C = kwargs.get('C', np.logspace(-4, 4, 5))
            _epsilon = kwargs.get('epsilon', np.logspace(-4, 4, 5))
            _gamma = kwargs.get('gamma', 'auto')
            _degree = kwargs.get('degree', 3)
            return SVR(kernel=_kernel, degree=_degree), {
                'C': _C,
                'epsilon': _epsilon
            }
        if _estimator == 'ridge':
            _alpha = kwargs.get('alpha', np.logspace(-4, 4, 20))
            return Ridge(), {'alpha': _alpha}
        if _estimator == 'lasso':
            _alpha = kwargs.get('alpha', np.logspace(-4, 4, 20))
            return Lasso(), {'alpha': _alpha}
        if _estimator == 'lars':
            _n_nonzero_coefs = kwargs.get('n_nonzero_coefs', np.inf)
            return Lars(), {'n_nonzero_coefs': _n_nonzero_coefs}
        if _estimator == 'elasticnet':
            _alpha = kwargs.get('alpha', np.logspace(-4, 4, 20))
            return ElasticNet(), {'alpha': _alpha}
        if _estimator == 'sgd' or _estimator == 'sgdregressor':
            _alpha = kwargs.get('alpha', np.logspace(-4, 4, 20))
            return SGDRegressor(), {'alpha': _alpha}
        if _estimator == 'randomforest':
            _n_estimators = range(5, 30, 5)
            return RandomForestRegressor(), {'n_estimators': _n_estimators}
        if _estimator == 'adaboost':
            _n_estimators = range(10, 60, 5)
            _learning_rate = np.logspace(-2, 1, 4)
            return AdaBoostRegressor(), {
                'n_estimators': _n_estimators,
                'learning_rate': _learning_rate
            }
        if _estimator == 'gradientboosting':
            _n_estimators = range(10, 60, 5)
            _learning_rate = np.logspace(-2, 1, 4)
            return GradientBoostingRegressor(), {
                'n_estimators': _n_estimators,
                'learning_rate': _learning_rate
            }
        if _estimator == 'lstm':
            _layers = kwargs.get('layers',
                                 [1, self._order, 2 * self._order, 1])
            _pct_dropout = kwargs.get('pct_dropout', 0.5)
            return LSTM(layers=_layers, pct_dropout=_pct_dropout), {}
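# Hedged usage sketch (not part of the original class): the 'lasso' branch above
# returns a (Lasso(), {'alpha': np.logspace(-4, 4, 20)}) pair that is meant to
# be fed into a grid search, roughly like this:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_regression(n_samples=100, n_features=5, noise=0.1,
                                 random_state=0)
estimator, params = Lasso(), {'alpha': np.logspace(-4, 4, 20)}
search = GridSearchCV(estimator, params, cv=3)
search.fit(X_demo, y_demo)
print(search.best_params_)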
Пример #46
0
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from mlxtend.regressor import StackingCVRegressor

y = dataset['PM']
x = dataset.drop(columns=['PM','CBWD'])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
lr = LinearRegression()
dtr = DecisionTreeRegressor()
svr_rbf = SVR(kernel='rbf', gamma='auto')
knr = KNeighborsRegressor()
ridge = Ridge()
lasso = Lasso()
regression_models = [lr, dtr, svr_rbf, knr, ridge, lasso]

sclf = StackingCVRegressor(regression_models, meta_regressor=ridge)
sclf.fit(x_train, y_train)
pred = sclf.predict(x_test)

print(sclf.score(x_train, y_train))
%matplotlib inline
plt.scatter([i*10 for i in range(len(y_test))], y_test, c='red', lw=1)
plt.plot([i*10 for i in range(len(y_test))], pred, c='black', lw=1)
plt.show()


Пример #47
0
    # Read the modified dataset
    y = dataset_oversampling['MWD']
    X = dataset_oversampling.drop(['Mn', 'MWD'], axis=1)
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)
    
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)
    
    # Select the best model parameters through GridSearchCV
    parameters = {
            'alpha':[1e-15,1e-10,1e-8,1e-4,1e-3,1e-2,0.1,1, 5,10,20,100]
            }
    las = Lasso()
    gsearch = GridSearchCV(estimator=las,
                           param_grid=parameters,
                           scoring='neg_mean_squared_error',
                           n_jobs=4, iid=False, cv=5)
    
    gsearch.fit(X_train,y_train)
    
    print("Best parameters selected: %s"%gsearch.best_params_)
    
    # Print the R2 and RMSE values of the predictive model & write the experimental vs. predicted values into excel files
    preds = gsearch.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, preds))
    print("Training data RMSE: %f" % (rmse))
    r2 = r2_score(y_train, preds)
    print("Training data R2: %f" % (r2))
Пример #48
0
        if linear_cv_mse == [] or mse1 < min(linear_cv_mse):
            best_lm_mse = mse1
            lr_model_best = lr_model
        linear_cv_mse.append(mse1)

        # Train the ridge model and save if it is best model
        rg_model = Ridge(alpha=20)
        rg_model.fit(X_train1, y_train1)
        mse2 = mean_squared_error(y_cv, rg_model.predict(X_cv))
        if ridge_cv_mse == [] or mse2 < min(ridge_cv_mse):
            best_rg_mse = mse2
            rg_model_best = rg_model
        ridge_cv_mse.append(mse2)

        # Train the Lasso model and save if it is best model
        lasso_model = Lasso(alpha=20)
        lasso_model.fit(X_train1, y_train1)
        mse3 = mean_squared_error(y_cv, lasso_model.predict(X_cv))
        if lasso_cv_mse == [] or mse3 < min(lasso_cv_mse):
            best_lasso_mse = mse3
            lasso_model_best = lasso_model
        lasso_cv_mse.append(mse3)

    ## Print the MSE for the linear best model from CV
    print("Best Linear model produced ", best_lm_mse, " MSE on CV")
    linear_predictions = lr_model_best.predict(X_test)
    linear_mse.append(mean_squared_error(y_test, linear_predictions))

    # Print the MSE for the ridge best model from CV
    print("Best Ridge model produced ", best_rg_mse, " MSE on CV")
    ridge_predictions = rg_model_best.predict(X_test)
# Linear regression and L2 regularization
X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LinearRegression().fit(X_train, y_train)

print(lr.score(X_train, y_train), lr.score(X_test, y_test))

ridge = Ridge(alpha=0.1).fit(X_train,
                             y_train)  # L2 regularization: alpha is the regularization strength; larger values push coefficients toward 0 and improve generalization
print(ridge.score(X_train, y_train), ridge.score(X_test, y_test))

mglearn.plots.plot_ridge_n_samples()
plt.show()

# L1 regularization
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
print(lasso.score(X_train, y_train), lasso.score(X_test, y_test))

# Linear models for classification
X, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 2, figsize=(10, 3))

for model, ax in zip([LinearSVC(C=1), LogisticRegression(C=1)],
                     axes):  # C is the regularization parameter; larger C means weaker regularization
    clf = model.fit(X, y)
    mglearn.plots.plot_2d_separator(clf,
                                    X,
                                    fill=False,
                                    eps=0.5,
                                    ax=ax,
                                    alpha=0.7)
# TODO: Add import statements
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Assign the data to predictor and outcome variables
# TODO: Load the data
train_data = pd.read_csv('data.csv', header=None)
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]

# TODO: Create the standardization scaling object.
scaler = StandardScaler()

# TODO: Fit the standardization parameters and scale the data.
X_scaled = scaler.fit_transform(X)

# TODO: Create the linear regression model with lasso regularization.
lasso_reg = Lasso()

# TODO: Fit the model.
lasso_reg.fit(X_scaled, y)

# TODO: Retrieve and print out the coefficients from the regression model.
reg_coef = lasso_reg.coef_
print(reg_coef)
Пример #51
0


##########################################################
# 	Lasso Regression 
##########################################################


n_alphas = 100
alphas = np.logspace(-5, 5, n_alphas)

coefs = list()
errors = list()

for a in alphas:
    lasso = Lasso(alpha=a)
    lasso.fit(X_train, y_train)
    coefs.append(lasso.coef_)
    y_pred = lasso.predict(X_test)
    errors.append(round(rmsle(y_test, y_pred), 4))




plt.plot(alphas, errors)
plt.plot(alphas, [baseline_error for _ in alphas])
plt.xscale('log')
plt.ylim([0.1, 0.3])
plt.show()
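# The loop above relies on an `rmsle` helper and a `baseline_error` value that
# are not shown in this excerpt; one plausible definition of the metric (an
# assumption, not the original code) is the root mean squared logarithmic error:
import numpy as np

def rmsle(y_true, y_pred):
    # clip negative predictions so that log1p stays defined
    y_pred = np.clip(y_pred, 0, None)
    return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2))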

Пример #52
0
ridge_reg = Ridge(alpha=1, solver = "cholesky")
ridge_reg.fit(X,y)
ridge_reg.predict([[1.5]])
# Using Stochastic Gradient Descent
sgd_reg = SGDRegressor(penalty="l2")
sgd_reg.fit(X,y.ravel())
sgd_reg.predict([[1.5]])
# Least Absolute Shrinkage and Selection Operator Regression - Lasso Regression
# Similar to Ridge regression but instead of using l2 norm , we use l1 norm.

# An important characteristic of Lasso regression is that it tends to completely eliminate the weights of the least
# important features (i.e. set them to zero). In other words, Lasso regression automatically performs feature selection
# and outputs a sparse model.

from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X,y)
lasso_reg.predict([[1.5]])

# ElasticNet - Elastic Net is a middle ground between Ridge Regression and Lasso Regression. The regularized term is a simple
# mix of both Ridge and Lasso's regularization terms, and you can control the mix ratio r. When r=0, Elastic Net is equivalent
# to Ridge Regression and when r = 1 it is equivalent to Lasso Regression
# J(theta) = MSE + r * (Lasso regularization term) + (1 - r) * (Ridge regularization term)
# It is almost always preferable to have at least a little bit of regularization, so generally you should avoid plain
# linear regression. Ridge is a good default, but if you suspect that only a few features are actually useful, you should
# prefer Lasso or Elastic Net since they tend to reduce the useless features' weights down to zero. In general, Elastic Net
# is preferred over Lasso since Lasso may behave erratically when the number of features is greater than the number
# of training instances or when several features are strongly correlated.
# l1_ratio corresponds to the mix ratio r
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio = 0.1)
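# The comments above describe the l1_ratio mix; a minimal sketch of actually
# fitting and using the ElasticNet defined above (same X, y as the Ridge and
# Lasso examples earlier in this snippet):
elastic_net.fit(X, y)
elastic_net.predict([[1.5]])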
train["SalePrice"] = np.log1p(train["SalePrice"])

numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(
    lambda x: skew(x.dropna()))  #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.65]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = boxcox1p(all_data[skewed_feats], 0.14)

all_data = pd.get_dummies(all_data)

all_data = all_data.fillna(all_data.mean())

X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice

#### models selection
lasso = Lasso(alpha=0.0002)
model = lasso

### prediction
model.fit(X_train, y)

preds = np.expm1(model.predict(X_test))
solution = pd.DataFrame({"id": test.Id, "SalePrice": preds})
solution.to_csv("full_features_lasso.csv", index=False)
Пример #54
0
def set_algorithm(algorithm, *args, **kwargs):
    """ Setup the algorithm to use in subsequent prediction analyses.

    Args:
        algorithm: The prediction algorithm to use. Either a string or an
                    (uninitialized) scikit-learn prediction object. If string,
                    must be one of 'svm', 'svr', 'linear', 'logistic', 'lasso',
                    'lassopcr','lassoCV','ridge','ridgeCV','ridgeClassifier',
                    'randomforest', or 'randomforestClassifier'
        kwargs: Additional keyword arguments to pass onto the scikit-learn
                prediction object.

    Returns:
        predictor_settings: dictionary of settings for prediction

    """

    # NOTE: function currently located here instead of analysis.py to avoid circular imports

    predictor_settings = {}
    predictor_settings['algorithm'] = algorithm

    def load_class(import_string):
        class_data = import_string.split(".")
        module_path = '.'.join(class_data[:-1])
        class_str = class_data[-1]
        module = importlib.import_module(module_path)
        return getattr(module, class_str)

    algs_classify = {
        'svm': 'sklearn.svm.SVC',
        'logistic': 'sklearn.linear_model.LogisticRegression',
        'ridgeClassifier': 'sklearn.linear_model.RidgeClassifier',
        'ridgeClassifierCV': 'sklearn.linear_model.RidgeClassifierCV',
        'randomforestClassifier': 'sklearn.ensemble.RandomForestClassifier'
    }
    algs_predict = {
        'svr': 'sklearn.svm.SVR',
        'linear': 'sklearn.linear_model.LinearRegression',
        'lasso': 'sklearn.linear_model.Lasso',
        'lassoCV': 'sklearn.linear_model.LassoCV',
        'ridge': 'sklearn.linear_model.Ridge',
        'ridgeCV': 'sklearn.linear_model.RidgeCV',
        'randomforest': 'sklearn.ensemble.RandomForest'
    }

    if algorithm in algs_classify.keys():
        predictor_settings['prediction_type'] = 'classification'
        alg = load_class(algs_classify[algorithm])
        predictor_settings['predictor'] = alg(*args, **kwargs)
    elif algorithm in algs_predict:
        predictor_settings['prediction_type'] = 'prediction'
        alg = load_class(algs_predict[algorithm])
        predictor_settings['predictor'] = alg(*args, **kwargs)
    elif algorithm == 'lassopcr':
        predictor_settings['prediction_type'] = 'prediction'
        from sklearn.linear_model import Lasso
        from sklearn.decomposition import PCA
        predictor_settings['_lasso'] = Lasso()
        predictor_settings['_pca'] = PCA()
        predictor_settings['predictor'] = Pipeline(steps=[(
            'pca',
            predictor_settings['_pca']), ('lasso',
                                          predictor_settings['_lasso'])])
    elif algorithm == 'pcr':
        predictor_settings['prediction_type'] = 'prediction'
        from sklearn.linear_model import LinearRegression
        from sklearn.decomposition import PCA
        predictor_settings['_regress'] = LinearRegression()
        predictor_settings['_pca'] = PCA()
        predictor_settings['predictor'] = Pipeline(
            steps=[('pca', predictor_settings['_pca']
                    ), ('regress', predictor_settings['_regress'])])
    else:
        raise ValueError("""Invalid prediction/classification algorithm name.
            Valid options are 'svm', 'svr', 'linear', 'logistic', 'lasso',
            'lassopcr', 'pcr', 'lassoCV', 'ridge', 'ridgeCV', 'ridgeClassifier',
            'randomforest', or 'randomforestClassifier'.""")

    return predictor_settings
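# Hedged usage sketch (not from the original module): request the lasso
# predictor and inspect the settings dictionary that set_algorithm returns.
settings = set_algorithm('lasso', alpha=0.1)
print(settings['prediction_type'])  # -> 'prediction'
print(settings['predictor'])        # -> Lasso(alpha=0.1)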
Пример #55
0
import numpy as np

# Regressions
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDClassifier, SGDRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier


models_regression = [LinearRegression(),
                     Ridge(random_state=42, max_iter=100),
                     Lasso(random_state=42, max_iter=100),
                     SVR(gamma='scale'),
                     AdaBoostRegressor(random_state=42, n_estimators=10),
                     GradientBoostingRegressor(random_state=42, max_depth=3, n_estimators=10),
                     RandomForestRegressor(n_estimators=10, random_state=42, max_depth=3)]

models_classification = [LogisticRegression(solver='lbfgs', max_iter=100, random_state=42),
                         SVC(gamma='scale', max_iter=100),
                         AdaBoostClassifier(random_state=42, n_estimators=10),
                         GradientBoostingClassifier(random_state=42, max_depth=3, n_estimators=10),
                         RandomForestClassifier(n_estimators=10, random_state=42, max_depth=3)]


def test_sklearn_estimator():
    ds = vaex.ml.datasets.load_iris()
    features = ['sepal_length', 'sepal_width', 'petal_length']
# Generate synthetic images, and projections
l = 128
proj_operator = build_projection_operator(l, l / 7.)
data = generate_synthetic_data()
proj = proj_operator * data.ravel()[:, np.newaxis]
proj += 0.15 * np.random.randn(*proj.shape)

# Reconstruction with L2 (Ridge) penalization
rgr_ridge = Ridge(alpha=0.2)
rgr_ridge.fit(proj_operator, proj.ravel())
rec_l2 = rgr_ridge.coef_.reshape(l, l)

# Reconstruction with L1 (Lasso) penalization
# the best value of alpha was determined using cross validation
# with LassoCV
rgr_lasso = Lasso(alpha=0.001)
rgr_lasso.fit(proj_operator, proj.ravel())
rec_l1 = rgr_lasso.coef_.reshape(l, l)

plt.figure(figsize=(8, 3.3))
plt.subplot(131)
plt.imshow(data, cmap=plt.cm.gray, interpolation='nearest')
plt.axis('off')
plt.title('original image')
plt.subplot(132)
plt.imshow(rec_l2, cmap=plt.cm.gray, interpolation='nearest')
plt.title('L2 penalization')
plt.axis('off')
plt.subplot(133)
plt.imshow(rec_l1, cmap=plt.cm.gray, interpolation='nearest')
plt.title('L1 penalization')
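# The comment above says alpha was chosen by cross validation with LassoCV; a
# hedged sketch of how that selection might look on the same projection data
# (the alpha grid is an assumption, and this can be slow at this problem size):
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(alphas=np.logspace(-4, -1, 10), cv=3)
lasso_cv.fit(proj_operator, proj.ravel())
print(lasso_cv.alpha_)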
Пример #57
0
 def test_binary_treatments(self):
     np.random.seed(123)
     # Generate data with binary treatments
     log_odds = np.dot(TestOrthoForest.W[:, TestOrthoForest.support], TestOrthoForest.coefs_T) + \
         TestOrthoForest.eta_sample(TestOrthoForest.n)
     T_sigmoid = 1 / (1 + np.exp(-log_odds))
     T = np.array([np.random.binomial(1, p) for p in T_sigmoid])
     TE = np.array([self._exp_te(x) for x in TestOrthoForest.X])
     Y = np.dot(TestOrthoForest.W[:, TestOrthoForest.support], TestOrthoForest.coefs_Y) + \
         T * TE + TestOrthoForest.epsilon_sample(TestOrthoForest.n)
     # Instantiate model with default params. Using n_jobs=1 since code coverage
     # does not work well with parallelism.
     est = DROrthoForest(n_trees=10,
                         n_jobs=1,
                         propensity_model=LogisticRegression(),
                         model_Y=Lasso(),
                         propensity_model_final=LogisticRegressionCV(
                             penalty='l1', solver='saga'),
                         model_Y_final=WeightedLassoCVWrapper())
     # Test inputs for binary treatments
     # --> Check that one can pass in regular lists
     est.fit(list(Y),
             list(T),
             X=list(TestOrthoForest.X),
             W=list(TestOrthoForest.W))
     # --> Check that it fails correctly if lists of different shape are passed in
     self.assertRaises(ValueError, est.fit, Y[:TestOrthoForest.n // 2],
                       T[:TestOrthoForest.n // 2], TestOrthoForest.X,
                       TestOrthoForest.W)
     # --> Check that it works when T, Y have shape (n, 1)
     est.fit(Y.reshape(-1, 1),
             T.reshape(-1, 1),
             X=TestOrthoForest.X,
             W=TestOrthoForest.W)
     # --> Check that it fails correctly when T has shape (n, 2)
     self.assertRaises(ValueError, est.fit, Y,
                       np.ones((TestOrthoForest.n, 2)), TestOrthoForest.X,
                       TestOrthoForest.W)
     # --> Check that it fails correctly when the treatments are not numeric
     self.assertRaises(ValueError, est.fit, Y,
                       np.array(["a"] * TestOrthoForest.n),
                       TestOrthoForest.X, TestOrthoForest.W)
     # Check that outputs have the correct shape
     out_te = est.const_marginal_effect(TestOrthoForest.x_test)
     self.assertSequenceEqual((TestOrthoForest.x_test.shape[0], 1, 1),
                              out_te.shape)
     # Test binary treatments with controls
     est = DROrthoForest(n_trees=100,
                         min_leaf_size=10,
                         max_depth=30,
                         subsample_ratio=0.30,
                         bootstrap=False,
                         n_jobs=1,
                         propensity_model=LogisticRegression(C=1 / 0.024,
                                                             penalty='l1',
                                                             solver='saga'),
                         model_Y=Lasso(alpha=0.024),
                         propensity_model_final=LogisticRegressionCV(
                             penalty='l1', solver='saga'),
                         model_Y_final=WeightedLassoCVWrapper())
     est.fit(Y,
             T,
             X=TestOrthoForest.X,
             W=TestOrthoForest.W,
             inference="blb")
     self._test_te(est,
                   TestOrthoForest.expected_exp_te,
                   tol=0.7,
                   treatment_type='discrete')
     self._test_ci(est,
                   TestOrthoForest.expected_exp_te,
                   tol=1.5,
                   treatment_type='discrete')
     # Test binary treatments without controls
     log_odds = TestOrthoForest.eta_sample(TestOrthoForest.n)
     T_sigmoid = 1 / (1 + np.exp(-log_odds))
     T = np.array([np.random.binomial(1, p) for p in T_sigmoid])
     Y = T * TE + TestOrthoForest.epsilon_sample(TestOrthoForest.n)
     est.fit(Y, T, X=TestOrthoForest.X, inference="blb")
     self._test_te(est,
                   TestOrthoForest.expected_exp_te,
                   tol=0.5,
                   treatment_type='discrete')
     self._test_ci(est,
                   TestOrthoForest.expected_exp_te,
                   tol=1.5,
                   treatment_type='discrete')
def trainModel(train, out_p=None, method="ET", is_regr=False, logger=None):
    """
    Train a regression or classification model F such that Y=F(X)
    
    Input:
        train (dictionary): the training data that looks like {"X": df_X, "Y": df_Y, "C": df_C}
            ...train["X"] is the feature, output from the computeFeatures() function in computeFeatures.py
            ...train["Y"] is the response, ouput from the computeFeatures() function in computeFeatures.py
            ...train["Y"] is the response, output from the computeFeatures() function in computeFeatures.py
        out_p (str): the path for saving the trained model (optional)
        method (str): the method for training the model
        is_regr (bool): regression or classification (see computeFeatures.py)
        logger: the python logger created by the generateLogger() function

    Output:
        model: the trained machine learning model
    """
    log("Training model with " + str(train["X"].shape[1]) + " features...",
        logger)

    # Build model
    multi_output = bool(len(train["Y"]) > 1 and train["Y"].shape[1] > 1)
    if is_regr:
        if method == "RF":
            model = RandomForestRegressor(n_estimators=200,
                                          max_features=90,
                                          min_samples_split=2,
                                          n_jobs=-1)
        elif method == "ET":
            model = ExtraTreesRegressor(n_estimators=200,
                                        max_features=180,
                                        min_samples_split=32,
                                        n_jobs=-1)
        elif method == "SVM":
            model = SVR(max_iter=1000, C=100, gamma=0.01)
            if multi_output: model = MultiOutputRegressor(model, n_jobs=-1)
        elif method == "RLR":
            model = HuberRegressor(max_iter=1000)
            if multi_output: model = MultiOutputRegressor(model, n_jobs=-1)
        elif method == "LR":
            model = LinearRegression()
            if multi_output: model = MultiOutputRegressor(model, n_jobs=-1)
        elif method == "EN":
            model = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=1000)
            if multi_output: model = MultiOutputRegressor(model, n_jobs=-1)
        elif method == "LA":
            model = Lasso(alpha=0.01, max_iter=1000)
            if multi_output: model = MultiOutputRegressor(model, n_jobs=-1)
        elif method == "MLP":
            model = MLPRegressor(hidden_layer_sizes=(128, 64))
        elif method == "KN":
            model = KNeighborsRegressor(n_neighbors=10, weights="uniform")
        elif method == "DT":
            model = DecisionTreeRegressor()
        else:
            m = method[:2]
            if m in ["RF", "ET"]:
                # parse tuning parameters
                p = method.split("-")
                log(
                    p[0] + ", n_estimators=" + p[1] + ", max_features=" +
                    p[2] + ", min_samples_split=" + p[3], logger)
                for i in range(1, len(p)):
                    if p[i] == "None": p[i] = None
                    elif p[i] == "auto": p[i] = "auto"
                    else: p[i] = int(p[i])
                if m == "RF":
                    model = RandomForestRegressor(n_estimators=p[1],
                                                  max_features=p[2],
                                                  min_samples_split=p[3],
                                                  random_state=0,
                                                  n_jobs=-1)
                elif m == "ET":
                    model = ExtraTreesRegressor(n_estimators=p[1],
                                                max_features=p[2],
                                                min_samples_split=p[3],
                                                random_state=0,
                                                n_jobs=-1)
            else:
                log("ERROR: method " + method + " is not supported", logger)
                return None
    else:
        if method == "RF":
            model = RandomForestClassifier(n_estimators=1000,
                                           max_features=30,
                                           min_samples_split=2,
                                           n_jobs=-1)
        elif method == "ET":
            model = ExtraTreesClassifier(n_estimators=1000,
                                         max_features=60,
                                         min_samples_split=32,
                                         n_jobs=-1)
        elif method == "SVM":
            model = SVC(max_iter=5000, kernel="rbf", probability=True)
        elif method == "MLP":
            model = MLPClassifier(hidden_layer_sizes=(128, 64))
        elif method == "KN":
            model = KNeighborsClassifier(n_neighbors=10, weights="uniform")
        elif method == "LG":
            model = LogisticRegression(penalty="l1", C=1)
        elif method == "HCR":
            model = ExtraTreesClassifier(n_estimators=1000,
                                         max_features=90,
                                         min_samples_split=32,
                                         n_jobs=-1)
            model = HybridCrowdClassifier(base_estimator=model, logger=logger)
        elif method == "CR":
            model = HybridCrowdClassifier(logger=logger)
        elif method == "DT":
            model = DecisionTreeClassifier(min_samples_split=20,
                                           max_depth=8,
                                           min_samples_leaf=5)
        elif method == "Base1":
            model = DummyClassifier(strategy="stratified")
        elif method == "Base2":
            model = DummyClassifier(strategy="uniform")
        elif method == "Base3":
            model = DummyClassifier(strategy="constant", constant=1)
        else:
            m = method[:2]
            if m in ["RF", "ET"]:
                # parse tuning parameters
                p = method.split("-")
                log(
                    p[0] + ", n_estimators=" + p[1] + ", max_features=" +
                    p[2] + ", min_samples_split=" + p[3], logger)
                for i in range(1, len(p)):
                    if p[i] == "None": p[i] = None
                    elif p[i] == "auto": p[i] = "auto"
                    else: p[i] = int(p[i])
                if m == "RF":
                    model = RandomForestClassifier(n_estimators=p[1],
                                                   max_features=p[2],
                                                   min_samples_split=p[3],
                                                   random_state=0,
                                                   n_jobs=-1)
                elif m == "ET":
                    model = ExtraTreesClassifier(n_estimators=p[1],
                                                 max_features=p[2],
                                                 min_samples_split=p[3],
                                                 random_state=0,
                                                 n_jobs=-1)
            else:
                log("ERROR: method " + method + " is not supported", logger)
                return None

    X, Y = copy.deepcopy(train["X"]), copy.deepcopy(train["Y"])

    # For one-class classification task, we only want to use the minority class (because we are sure that they are labeled)
    if not is_regr and method == "IF":
        y_minor = findLeastCommon(Y)
        select_y = (Y == y_minor)
        X, Y = X[select_y], Y[select_y]

    # Fit data to the model
    model.fit(X, np.squeeze(Y))

    # Save and return model
    if out_p is not None:
        joblib.dump(model, out_p)
        log("Model saved at " + out_p, logger)
    return model
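# Hedged usage sketch (not from the original source): tiny synthetic dataframes
# standing in for the output of computeFeatures(), just to show the call.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df_X = pd.DataFrame(rng.randn(50, 4), columns=["f0", "f1", "f2", "f3"])
df_Y = pd.DataFrame({"y": (rng.rand(50) > 0.5).astype(int)})
model = trainModel({"X": df_X, "Y": df_Y, "C": None}, method="DT", is_regr=False)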
Пример #59
0
import matplotlib.pyplot as plt

from sklearn.datasets import make_friedman1
from sklearn.linear_model import Lasso

X, Y = make_friedman1(n_samples=500, n_features=5)

###########################
# Plot these data.

fig = plt.figure(figsize=(5, 5))
ax = plt.subplot()
ax.plot(X[:, 0], Y, '.')

##########################
# We pick a linear regression model with a constraint
# on the coefficients:
# `Lasso <http://scikit-learn.org/stable/modules/
# generated/sklearn.linear_model.Lasso.html>`_.

reglin = Lasso()
reglin.fit(X, Y)

##############################
# Fitting the model produces a line
# whose coefficients are:
print(reglin.coef_, reglin.intercept_)

###############################
# We go back to the first plot and add the line
# corresponding to the linear regression fitted
# on the first dimension only.

reglin = Lasso()
reglin.fit(X[:, :1], Y)
Пример #60
0
plt.ylabel('regression coefficients');


# # 6. Lasso

# In[64]:


# Lasso lives under sklearn.linear_model
from sklearn.linear_model import Lasso


# In[65]:


las = Lasso(alpha=0.05)   # alpha is the penalty coefficient; larger values mean a stronger penalty
las.fit(aba.iloc[:, :-1], aba.iloc[:, -1])


# In[67]:


las.coef_


# In[68]:


def regularize(xMat,yMat):
    inxMat = xMat.copy()                   # copy the data
    inyMat = yMat.copy()