def test_1d_multioutput_enet_and_multitask_enet_cv():
    X, y, _, _ = build_dataset(n_features=10)
    y = y[:, np.newaxis]
    clf = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf.fit(X, y[:, 0])
    clf1 = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf1.fit(X, y)
    assert_almost_equal(clf.l1_ratio_, clf1.l1_ratio_)
    assert_almost_equal(clf.alpha_, clf1.alpha_)
    assert_almost_equal(clf.coef_, clf1.coef_[0])
    assert_almost_equal(clf.intercept_, clf1.intercept_[0])
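Note that build_dataset above comes from scikit-learn's private test utilities, not the public API. A minimal stand-in under that assumption, built on make_regression, might look like:
# Hypothetical stand-in for scikit-learn's private build_dataset test helper;
# returns train and test splits of a synthetic regression problem.
import numpy as np
from sklearn.datasets import make_regression

def build_dataset(n_samples=50, n_features=200, n_informative_features=10,
                  n_targets=1):
    X, y = make_regression(n_samples=2 * n_samples, n_features=n_features,
                           n_informative=n_informative_features,
                           n_targets=n_targets, noise=0.1, random_state=0)
    return X[:n_samples], y[:n_samples], X[n_samples:], y[n_samples:]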
Example #2
class _MultiTaskElasticNetCVImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
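The Op name is bound elsewhere in the originating library (this wrapper style appears in lale). A usage sketch, assuming Op aliases MultiTaskElasticNetCV and is bound before the class definition above:
# Sketch assuming Op aliases the wrapped estimator; cv and l1_ratio are
# ordinary MultiTaskElasticNetCV keyword arguments.
from sklearn.datasets import make_regression
from sklearn.linear_model import MultiTaskElasticNetCV as Op

X, y = make_regression(n_samples=100, n_features=10, n_targets=2, random_state=0)
impl = _MultiTaskElasticNetCVImpl(cv=3, l1_ratio=0.5)
print(impl.fit(X, y).predict(X[:5]))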
Example #3
import time
from typing import Union

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.linear_model import MultiTaskElasticNetCV


def train_glm_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
) -> BaseEstimator:
    """Train a basic Generalized Linear Model (GLM)

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame 
             (n_samples x d_features)
             input training data
    
    ytrain : np.ndarray, pd.DataFrame 
             (n_samples x p_outputs)
             labeled training data 
    
    verbose : int, default=0
        option to print out training messages 

    Returns 
    -------
    gl_model : BaseEstimator
        the trained model
    """
    # Initialize GLM
    gl_model = MultiTaskElasticNetCV(
        alphas=None,
        cv=3,
        random_state=123,
        n_jobs=-1,
        normalize=False,
        selection="random",
        verbose=verbose,
    )

    # train GLM
    t0 = time.time()
    gl_model.fit(xtrain, ytrain)
    t1 = time.time() - t0
    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return gl_model
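A minimal usage sketch under the assumption that the imports above are in place; note that normalize was deprecated in scikit-learn 1.0 and removed in 1.2, so on recent versions it must be dropped from the MultiTaskElasticNetCV call.
# Usage sketch with synthetic two-target data.
rng = np.random.default_rng(0)
xtrain = rng.normal(size=(200, 10))
ytrain = xtrain[:, :2] + 0.1 * rng.normal(size=(200, 2))
gl_model = train_glm_model(xtrain, ytrain, verbose=1)
print(gl_model.alpha_, gl_model.l1_ratio_)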
Example #4
def elastic_net(X, Y):
    print(X.shape)
    clf = MultiTaskElasticNetCV(l1_ratio=0.5, eps=0.001, n_alphas=100, alphas=None,
                                fit_intercept=True, normalize=False, max_iter=1000,
                                tol=0.0001, cv=None, copy_X=True, verbose=0,
                                n_jobs=1, random_state=None, selection='cyclic')

    clf.fit(X, Y)
    sfm = SelectFromModel(clf, prefit=True)
    values = sfm.get_support(indices=True)
    new_features = sfm.transform(X)

    return new_features, values
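A hedged driver for elastic_net; SelectFromModel lives in sklearn.feature_selection, and on scikit-learn >= 1.2 the normalize argument above must be removed.
# Illustrative call with synthetic multi-target data.
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import MultiTaskElasticNetCV

X, Y = make_regression(n_samples=100, n_features=20, n_targets=2,
                       n_informative=5, random_state=0)
new_features, values = elastic_net(X, Y)
print(new_features.shape, values)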
Example #5
def train_multi_elasticnet(train_features, train_labels, num_alphas,
                           skip_cross_validation, alpha, l1_ratio, num_jobs):
    """
  Performs the cross validation of multi elastic net model, and returns the trained model
  with best params. Assume features are scaled/normalized. Assumes train_labels has more
  than one column.
  """

    best_alpha = alpha
    best_l1_ratio = l1_ratio
    max_iter = 10000
    tol = 0.0005
    if not skip_cross_validation:
        # use 5 fold cross validation
        model = MultiTaskElasticNetCV(l1_ratio=[
            0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.925, 0.95, 0.975, 0.99, 0.999,
            0.9999
        ],
                                      max_iter=max_iter,
                                      cv=5,
                                      n_alphas=num_alphas,
                                      n_jobs=num_jobs,
                                      normalize=False,
                                      tol=tol)
        model.fit(train_features, train_labels)
        best_alpha = model.alpha_
        best_l1_ratio = model.l1_ratio_
        #print("number of iterations were {}".format(model.n_iter_))

    model = MultiTaskElasticNet(alpha=best_alpha,
                                l1_ratio=best_l1_ratio,
                                normalize=False,
                                max_iter=max_iter,
                                tol=tol)
    model.fit(train_features, train_labels)

    return (model, {'alpha': best_alpha, 'l1_ratio': best_l1_ratio})
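Assuming MultiTaskElasticNet and MultiTaskElasticNetCV are imported from sklearn.linear_model (and the normalize arguments are dropped on scikit-learn >= 1.2), the helper can be exercised like this:
# Hedged usage sketch for train_multi_elasticnet.
from sklearn.datasets import make_regression

X, Y = make_regression(n_samples=150, n_features=30, n_targets=3,
                       random_state=0)
model, best_params = train_multi_elasticnet(X, Y, num_alphas=20,
                                            skip_cross_validation=False,
                                            alpha=None, l1_ratio=None,
                                            num_jobs=1)
print(best_params)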
Example #6
def test_multitask_enet_and_lasso_cv():
    X, y, _, _ = build_dataset(n_features=50, n_targets=3)
    clf = MultiTaskElasticNetCV(cv=3).fit(X, y)
    assert_almost_equal(clf.alpha_, 0.00556, 3)
    clf = MultiTaskLassoCV(cv=3).fit(X, y)
    assert_almost_equal(clf.alpha_, 0.00278, 3)

    X, y, _, _ = build_dataset(n_targets=3)
    clf = MultiTaskElasticNetCV(n_alphas=10, eps=1e-3, max_iter=100,
                                l1_ratio=[0.3, 0.5], tol=1e-3, cv=3)
    clf.fit(X, y)
    assert 0.5 == clf.l1_ratio_
    assert (3, X.shape[1]) == clf.coef_.shape
    assert (3, ) == clf.intercept_.shape
    assert (2, 10, 3) == clf.mse_path_.shape
    assert (2, 10) == clf.alphas_.shape

    X, y, _, _ = build_dataset(n_targets=3)
    clf = MultiTaskLassoCV(n_alphas=10, eps=1e-3, max_iter=100, tol=1e-3, cv=3)
    clf.fit(X, y)
    assert (3, X.shape[1]) == clf.coef_.shape
    assert (3, ) == clf.intercept_.shape
    assert (10, 3) == clf.mse_path_.shape
    assert 10 == len(clf.alphas_)
Example #7
    def select_mtelastic(self, X, y):
        # MultiTaskElasticNetCV from sklearn is used to determine the best
        # alpha for multi-task elastic net regression.

        mtlasso_alphas = MultiTaskElasticNetCV(alphas=[
            0.00001, .0001, .001, .002, .003, .004, .005, .006, .007, .008,
            .009, .099, .01, .011, .012, .013, .014, .015, .016, .017, .018,
            .019, .02, .025, .026, .027, .028, .029, .03, .031, .032, .033,
            .034, .035, .036, .037, .038, .039, .04, .041, .042, .043, .044,
            .045, .05, .06, .07, .071, .072, .073, .074, .075, .076, .077,
            .078, .079, .08, .1, .2, .225, .23, .24, .245, .246, .247, .248,
            .249, .25, .251, .252, .253, .254, .255, .26, .27, .275, .3, .35,
            .4, .45, .46, .47, .48, .481, .482, .483, .484, .485, .486, .487,
            .488, .489, .49, .491, .492, .493, .494, .495, .496, .497, .498,
            .499, .5, .51, .511, .512, .513, .514, .515, .516, .517, .518,
            .519, .52, .525, .53, .54, .55, .6, .75, .752, .7527, .7528, .7529,
            .753, .7531, .754, .7545, .755, .756, .76, .765, .77, .78, .79, .8,
            .9, 1.0, 1.2, 1.25, 1.5, 1.75, 2.0
        ])

        sel_alpha = mtlasso_alphas.fit(X, y)
        print(sel_alpha.alpha_)
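Enumerating alphas by hand like this works, but MultiTaskElasticNetCV can build its own grid: with alphas left at None, n_alphas and eps define a data-driven logarithmic grid. A sketch of that more common pattern:
# Letting the estimator build the alpha grid instead of hand-listing values.
from sklearn.datasets import make_regression
from sklearn.linear_model import MultiTaskElasticNetCV

X, y = make_regression(n_samples=100, n_features=10, n_targets=2, random_state=0)
model = MultiTaskElasticNetCV(n_alphas=100, eps=1e-3, cv=5).fit(X, y)
print(model.alpha_)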
Example #8
# Concatenate the discrete and continuous features
x_vec = np.concatenate((x_vec_con, x_vec_dis), axis=1)

# Targets for prediction
y_registered = bike_rel['registered'].values.astype(float)
y_casual = bike_rel['casual'].values.astype(float)

y = np.stack((y_registered, y_casual), axis=1)

# Build models and make predictions
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import MultiTaskElasticNetCV
x1, x2, y1, y2 = train_test_split(x_vec, y, test_size=0.2, random_state=20)

############ Lasso
mtl = MultiTaskLassoCV(alphas=np.logspace(-3, -1, 3), cv=8, verbose=3)
mtl.fit(x1, y1)
mtl.score(x1, y1)
mtl.score(x2, y2)

############ ElasticNetCV
mte = MultiTaskElasticNetCV(l1_ratio=np.logspace(-3, -1, 3),
                            alphas=np.logspace(-3, -1, 3),
                            cv=8,
                            verbose=3)
mte.fit(x1, y1)
mte.score(x1, y1)
mte.score(x2, y2)
Example #9
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=1)

folds = 5
alphas = np.logspace(1, 5, 3)
l1_ratios = np.linspace(0, 1, 2, endpoint=True)

models = MultiTaskElasticNetCV(l1_ratio=l1_ratios,
                               alphas=alphas,
                               verbose=1,
                               cv=folds,
                               n_jobs=-1)
models.fit(X_train, Y_train)
models.score(X_test, Y_test)

print "Alpha: ", models.alpha_
print "L1 ratio: ", models.l1_ratio_
print "Score of Elastic-net on test data: ", models.score(X_test, Y_test)

model_EN = ElasticNet(l1_ratio=models.l1_ratio_, alpha=models.alpha_)
model_EN.fit(np.concatenate((X_train, X_test)),
             np.concatenate((Y_train, Y_test)))

test = np.rint(models.predict(X_test)).astype('int16')
coeff = model_EN.coef_.T
# coeff = models.coef_.T

# high=1.0
Example #10
    def scorer(pipe, X, y):
        pred = pipe.predict(X)
        return metrics.f1_score(y, pred)

    accum = np.zeros((X.shape[1],))
    for y in np.transpose(Y):
        selector = SelectKBest(f_classif, k=selectedFeaureNum)
        selector = selector.fit(X, y)
        accum += selector.pvalues_
    selectedIndices = accum.argsort()[:selectedFeaureNum]
    def transform(X):
        return X[:, selectedIndices]
    X_filtered, X_test_filtered = transform(X), transform(X_test)
    clf = MultiTaskElasticNetCV(normalize=True)
    #clf = MultiTaskLasso(normalize=True)
    clf.fit(X_filtered, Y)
    predTrain = np.array(clf.predict(X_filtered))
    splits = []
    for col in range(predTrain.shape[1]):
        bestSplit, bestF1 = labanUtil.getSplitThreshold(predTrain[:, col], Y[:, col])
        splits.append(bestSplit)
    pred = np.array(clf.predict(X_test_filtered))
    for col in range(pred.shape[1]):
        pred[:, col] = [1 if e>=splits[col] else 0 for e in pred[:, col]]
        predTrain[:, col] = [1 if e>=splits[col] else 0 for e in predTrain[:, col]]
    ps.append(metrics.precision_score(Y_test, pred))
    rs.append(metrics.recall_score(Y_test, pred))
    teF  = metrics.f1_score(Y_test, pred)
    teFs.append(teF)
    trFs.append(metrics.f1_score(Y, predTrain))
    print('test#: ', test)
Example #11
from sklearn.linear_model import MultiTaskElasticNet, MultiTaskElasticNetCV

#cross-validating to find best hyperparams
cv_model = MultiTaskElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                                 verbose=1)
cv_model.fit(X_train, y_train)

#fitting model with hyperparameters from above
model = MultiTaskElasticNet(alpha=cv_model.alpha_,
                            l1_ratio=cv_model.l1_ratio_,
                            random_state=0)
model.fit(X_train, y_train)

#predicting
preds = model.predict(X_test)
test_df[[
    'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2'
]] = preds
test_df.drop(columns=["is_train"], inplace=True)
test_df.head()

#predictions housekeeping
sub_df = cudf.melt(test_df[[
    "Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
]],
                   id_vars=["Id"],
                   value_name="Predicted")
sub_df["Id"] = sub_df["Id"].astype("str") + "_" + sub_df["variable"].astype(
    "str")
sub_df = sub_df.drop("variable", axis=1).sort_values("Id")
assert sub_df.shape[0] == test_df.shape[0] * 5
Example #12
def test_enet_path():
    # We use a large number of samples and of informative features so that
    # the l1_ratio selected is more toward ridge than lasso
    X, y, X_test, y_test = build_dataset(n_samples=200,
                                         n_features=100,
                                         n_informative_features=100)
    max_iter = 150

    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests
    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1],
                       eps=2e-3,
                       l1_ratio=[0.5, 0.7],
                       cv=3,
                       max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert clf.l1_ratio_ == min(clf.l1_ratio)

    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1],
                       eps=2e-3,
                       l1_ratio=[0.5, 0.7],
                       cv=3,
                       max_iter=max_iter,
                       precompute=True)
    ignore_warnings(clf.fit)(X, y)

    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert clf.l1_ratio_ == min(clf.l1_ratio)

    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert clf.score(X_test, y_test) > 0.99

    # Multi-output/target case
    X, y, X_test, y_test = build_dataset(n_features=10, n_targets=3)
    clf = MultiTaskElasticNetCV(n_alphas=5,
                                eps=2e-3,
                                l1_ratio=[0.5, 0.7],
                                cv=3,
                                max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert clf.score(X_test, y_test) > 0.99
    assert clf.coef_.shape == (3, 10)

    # Mono-output should have same cross-validated alpha_ and l1_ratio_
    # in both cases.
    X, y, _, _ = build_dataset(n_features=10)
    clf1 = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf1.fit(X, y)
    clf2 = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf2.fit(X, y[:, np.newaxis])
    assert_almost_equal(clf1.l1_ratio_, clf2.l1_ratio_)
    assert_almost_equal(clf1.alpha_, clf2.alpha_)
Example #13
    used_mets = []
    for mm in g2:
        reacs = [react_dict[z] for z in mm]
        m = Model(reacs)
        used_mets.append(m.ex_reactants)
    used_mets = list(chain.from_iterable(used_mets))
    mf = []
    for mm in dm:
        mf.append(used_mets.count(mm) / len(g2))
    true_used_env.append(mf)

    from sklearn.linear_model import MultiTaskElasticNetCV as EN
    enet = EN(cv=50, max_iter=100000)
    x = full_freq_m.T[m_diff_freq_m > .005].T
    y = used_environment.T[m_diff_used_env > 0.005].T
    mod = enet.fit(x, y)
    p = mod.predict(f2[m_diff_freq_m > .005].reshape(1, -1))

    p = p.flatten()
    p = p + abs(min(p))
    p = p / max(p)

    c = [sts.pearsonr(mf, used_environment[ee][m_diff_used_env > 0.005])[0]
         for ee in range(len(used_environment))]

    predicted.append(sts.pearsonr(p, mf)[0])

    average.append(mean(c))

    predicted_environments.append(p)
Example #14
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 21 23:51:12 2016

@author: patanjali
"""

from sklearn.linear_model import MultiTaskElasticNetCV
from utils2 import load_dataset
import pandas

train, validate, test = load_dataset()

no_classes = train[:,0].max()+1
train_y = pandas.get_dummies(train[:,0])

print(no_classes, train.shape)

train = train[:201]
validate = validate[:201]
test = test[:201]

for l1_ratio in [.1, .5, .7, .9, .95, .99, 1]:
    
    model = MultiTaskElasticNetCV(l1_ratio=l1_ratio, normalize=True, verbose=True, n_jobs=3)
    model.fit(train[:,1:], train_y)
    predicted_classes = (model.predict(validate[:,1:])).argmax(1)
    
    correct = sum(predicted_classes==validate[:,0])
    print(l1_ratio, correct, correct * 1.0 / validate.shape[0])
    
Example #15
# the parameters below are new in sklearn 0.18
dot_data = export_graphviz(tree,
                           out_file=None,
                           feature_names=['petal length', 'petal width'],
                           class_names=['setosa', 'versicolor', 'virginica'],
                           filled=True,
                           rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data)  
display(Image(graph.create_png()))

export_graphviz(tree, 
                out_file='tree.dot', 
                feature_names=['petal length', 'petal width'])
Image(filename='./images/03_18.png', width=600)



from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3)
clf = clf.fit(iris.data, iris.target)

with open("iris.dot", 'w') as f:
    f = tree.export_graphviz(clf, out_file=f)

import pydotplus 
dot_data = tree.export_graphviz(clf, out_file=None) 
graph = pydotplus.graph_from_dot_data(dot_data) 
graph.write_pdf("iris.pdf") 

Example #16
class PhonesthemesModel(object):
    """
    Attributes
    ----------
    self.config: Dict
        A dictionary of the arguments passed into the object.

    self.ngrams: List[int]
        A list of integers that refer to the ngram sizes to use.

    self.mode: List[str]
        List of str indicating the positions in the word to use
        as candidate phonesthemes. Possible elements are "start",
        "end", and "all".

    self.min_count: int
        Minimum number of ngram occurrences required for an ngram to be
        included as a feature.

    self.one_hot: bool
        Whether or not to use one-hot features instead of counts for
        the phonestheme ngram features.

    self.vectors
        Dictionary of word to vector, where word is either a string or a
        tuple of strings (phoneme representation).

    self.phonesthemes_reg
        The MultiTaskElasticNetCV model fit on the phonestheme feature vectors to
        predict the phonestheme targets.

    self.X_ngram
        The input feature vectors used to fit the Elastic Net.

    self.ngram_to_idx
        A mapping from ngram to feature index of X_ngram.

    self.is_trained
        A boolean describing whether this model has been trained or not.
    """
    def __init__(self, ngrams, mode, min_count, one_hot):
        self.config = locals()
        self.config.pop("self")
        self.config.pop("__class__", None)

        logger.info("Config: ")
        pprint.pprint(self.config)

        self.ngrams = ngrams
        self.mode = mode
        self.min_count = min_count
        self.one_hot = one_hot

        # Placeholder values, these get set when we call train
        self.vectors = None
        self.phonesthemes_reg = None
        self.X_ngram = None
        self.ngram_to_idx = None
        self.phonemes_to_graphemes = None

        self.is_trained = False

    def get_phonesthemes(self):
        return get_phonesthemes_from_model(self)

    def train(self,
              vectors_path,
              bound_morphemes_path=None,
              word_segmentations_path=None,
              graphemes_to_phonemes_path=None,
              n_jobs=1,
              l1_ratio=0.5):
        train_config = locals()
        train_config.pop("self")
        train_config.pop("__class__", None)
        self.config["train_config"] = train_config
        logger.info("Train config: ")
        pprint.pprint(train_config)

        # Load vectors, where the keys can be words represented as
        # sequences of characters (normal word vectors) or words represented
        # as sequences of phonemes (phonemicized vectors).
        logger.info("Reading vectors from {}".format(vectors_path))
        self.vectors = OrderedDict()
        with open(vectors_path) as vectors_file:
            for line in tqdm(vectors_file,
                             total=get_line_number(vectors_path)):
                split_line = line.rstrip("\n").split()
                word = split_line[0]
                # If we have phonemicized vectors, the keys to the dict are
                # tuples of comma-separated phonemes representing a word.
                if graphemes_to_phonemes_path is not None:
                    word = tuple(word.split(","))
                embedding = np.array([float(val) for val in split_line[1:]])
                self.vectors[word] = embedding

        # Randomly shuffle the OrderedDict
        random_seed = 0
        logger.info(
            "Shuffling vectors with random seed {}".format(random_seed))
        random.seed(random_seed)
        vector_items = list(self.vectors.items())
        # random.shuffle is in-place
        random.shuffle(vector_items)
        self.vectors = OrderedDict(vector_items)

        vocabulary = list(self.vectors.keys())
        targets = np.asarray(list(self.vectors.values()))

        # Load phonemes to graphemes if we were given g2p data
        if graphemes_to_phonemes_path:
            logger.info("Reading graphemes to phonemes data "
                        "from {}".format(graphemes_to_phonemes_path))
            self.phonemes_to_graphemes = {}
            # Load the graphemes to phonemes data
            with open(
                    graphemes_to_phonemes_path) as graphemes_to_phonemes_file:
                for line in tqdm(
                        graphemes_to_phonemes_file,
                        total=get_line_number(graphemes_to_phonemes_path)):
                    split_line = line.rstrip("\n").split("\t")
                    word = split_line[0]
                    phonemes = tuple(split_line[1].split(" "))
                    self.phonemes_to_graphemes[phonemes] = word

        if bound_morphemes_path is not None:
            # Load morpheme data if we were given bound morphemes
            word_segmentations, bound_morphemes = self._load_morpheme_data(
                word_segmentations_path, bound_morphemes_path)
            # Update targets with predictions of the morpheme model. This is equivalent
            # to using the model residuals as the new targets.
            targets = self._get_morpheme_residuals(vocabulary,
                                                   targets,
                                                   bound_morphemes,
                                                   graphemes_to_phonemes_path,
                                                   word_segmentations,
                                                   n_jobs=n_jobs)

        # Get the ngram features for the vocabulary.
        self.X_ngram, self.ngram_to_idx = build_ngram_features(
            vocabulary=vocabulary,
            one_hot=self.one_hot,
            ngram_range=self.ngrams,
            mode=self.mode,
            freq_thres=self.min_count)
        logger.info("Shape of ElasticNet input (number of words, "
                    "number of candidate phonesthemes): {}".format(
                        self.X_ngram.shape))
        logger.info("Shape of ElasticNet targets (number of words, "
                    "vector dimension): {}".format(targets.shape))
        # Fit a MultiTaskElasticNetCV model to extract phonesthemes.
        logger.info("Fitting MultiTaskElasticNetCV")
        self.phonesthemes_reg = MultiTaskElasticNetCV(l1_ratio=l1_ratio,
                                                      n_jobs=n_jobs,
                                                      random_state=0,
                                                      cv=5)
        self.phonesthemes_reg.fit(self.X_ngram, targets)
        logger.info("Done fitting MultiTaskElasticNetCV")

        self.is_trained = True

    def _load_morpheme_data(self, word_segmentations_path,
                            bound_morphemes_path):
        # Load word segmentations
        word_segmentations = {}
        if word_segmentations_path:
            logger.info("Loading word segmentations from {}".format(
                word_segmentations_path))
            with open(word_segmentations_path) as word_segmentations_file:
                for line in tqdm(
                        word_segmentations_file,
                        total=get_line_number(word_segmentations_path)):
                    split_line = line.rstrip("\n").split("\t")
                    assert len(split_line) == 2
                    word = split_line[0]
                    morphemes = split_line[1].split(" ")
                    word_segmentations[word] = morphemes
            logger.info("Loaded {} word segmentations".format(
                len(word_segmentations)))

        # Load the list of bound morphemes
        logger.info(
            "Loading bound morphemes from {}".format(bound_morphemes_path))
        bound_morphemes = []
        with open(bound_morphemes_path) as bound_morphemes_file:
            for line in tqdm(bound_morphemes_file,
                             total=get_line_number(bound_morphemes_path)):
                bound_morphemes.append(line.rstrip("\n"))
        logger.info("Loaded {} bound morphemes".format(len(bound_morphemes)))
        return (word_segmentations, bound_morphemes)

    def _get_morpheme_residuals(self,
                                vocabulary,
                                targets,
                                bound_morphemes,
                                graphemes_to_phonemes_path,
                                word_segmentations=None,
                                n_jobs=1):
        # Get the vectors vocabulary, and convert to string if we are using
        # phonemicized vectors.
        if graphemes_to_phonemes_path is None:
            string_vectors_vocab = vocabulary
        else:
            # The vocab of the phonemicized vectors converted to graphemes.
            string_vectors_vocab = [
                self.phonemes_to_graphemes[phonemes] for phonemes in vocabulary
            ]
        # Build the morpheme feature vectors.
        morpheme_features = build_morpheme_features(string_vectors_vocab,
                                                    bound_morphemes,
                                                    word_segmentations)
        logger.info("Input shape for morpheme pretraining linear regression "
                    "(number of words, number of morphemes): {}".format(
                        morpheme_features.shape))
        logger.info("Target shape for morpheme pretraining linear regression "
                    "(number of words, vector dimension): {}".format(
                        targets.shape))
        morph_reg = LinearRegression(n_jobs=n_jobs)
        logger.info("Pretraining on morpheme features.")
        morph_reg = morph_reg.fit(morpheme_features, targets)
        logger.info("Calculating residuals of of linear regression done "
                    "on morpheme features and using that as the train "
                    "vectors for the ngram feature model.")

        # Get the residuals of the model for use in the second model.
        morph_reg_pred_y = morph_reg.predict(morpheme_features)
        morph_reg_residuals = np.subtract(targets, morph_reg_pred_y)
        return morph_reg_residuals

    def __eq__(self, other):
        # Two PhonesthemesModel objects are the same if their members are
        # the same.
        # Compare their ngrams
        if self.ngrams != other.ngrams:
            return False
        # Compare their mode
        if self.mode != other.mode:
            return False
        # Compare their min count
        if self.min_count != other.min_count:
            return False
        # Compare whether they use one-hot or frequency features
        if self.one_hot != other.one_hot:
            return False
        # Compare that they have the same set of vectors in the same order
        if len(self.vectors) != len(other.vectors):
            return False
        for this_word, other_word in zip(self.vectors, other.vectors):
            if this_word != other_word:
                return False
            if not np.allclose(self.vectors[this_word],
                               other.vectors[this_word]):
                return False
        # Check that they were trained on the same features
        if not np.allclose(self.X_ngram, other.X_ngram):
            return False
        # Check that they have the same mapping of ngram to feature idx
        if self.ngram_to_idx != other.ngram_to_idx:
            return False
        return True

    if six.PY2:

        def __ne__(self, other):
            equal = self.__eq__(other)
            return equal if equal is NotImplemented else not equal
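A hedged usage sketch for the class above; the file paths and formats (whitespace-separated "word v1 v2 ..." vectors, one bound morpheme per line) are assumptions for illustration.
# Illustrative driver for PhonesthemesModel; paths are placeholders.
model = PhonesthemesModel(ngrams=[2, 3], mode=["start", "end"],
                          min_count=5, one_hot=True)
model.train(vectors_path="vectors.txt",
            bound_morphemes_path="bound_morphemes.txt",
            n_jobs=2, l1_ratio=0.5)
phonesthemes = model.get_phonesthemes()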
Example #17
def main(family, quantile_ass=.99):
    data_folder = os.path.join(Path(os.getcwd()).parents[1], 'data')
    # load a pickle generated by the "associate_env.py" script
    store = pickle.load(open(data_folder + '/pickles/' + family + '.pkl', 'rb'))
    
    used_environment = store['used_env'].copy()
    
    full_freq_m = store['full_freq_m'].copy()
    
    reactome = store['reactome'].copy()
    model_sample = store['model_sample'].copy()
    transporter = store['transporter'].copy()
    
    # replace NaN values with the average
    av_used_env = np.nanmean(used_environment, 0)
    inds = np.where(np.isnan(used_environment))
    used_environment[inds] = np.take(av_used_env, inds[1])

    # for reaction frequency
    env_driv_reac_score = get_residual_scores(full_freq_m)

    reac_cutoff = np.std(env_driv_reac_score)

    env_driven_reactome = reactome[env_driv_reac_score > reac_cutoff]
    reaction_frequency = full_freq_m.T[env_driv_reac_score > reac_cutoff].T

    clss_freq_m = get_residuals(reaction_frequency)

    # for the environment
    env_driv_met_score = get_residual_scores(used_environment)

    met_cutoff = np.std(env_driv_met_score)

    driving_mets = transporter[env_driv_met_score > met_cutoff]
    used_env = used_environment.T[env_driv_met_score > met_cutoff].T

    clss_used_env = get_residuals(used_env)

    # regression terms
    x = reaction_frequency.copy()
    y = used_env.copy()

    cosine_dict = {}
    
    for i, reac in enumerate(clss_freq_m.T):
        cosine_dict[env_driven_reactome[i]] = np.array(
            [cosine(reac.flatten(), metab.flatten()) for metab in clss_used_env.T])

    cosine_v = np.array([cosine_dict[i] for i in envd_reactions])

    # find metabolite concentrations for models
    from sklearn.linear_model import MultiTaskElasticNetCV as EN
    enet = EN(cv=3, verbose=1, n_jobs=7, max_iter=10000)
    print(x.shape, y.shape)
    mod = enet.fit(x, y)
    evolved_env = np.zeros((len(model_sample), len(dm)))

    for i, mod_prof in enumerate(model_sample):
        # print(family, i)
        v = mod_prof[env_driv_reac_score > 0]

        p = mod.predict(v[s_clss_fm != 0].reshape(1, -1))
        p = p.flatten()
        p = p + abs(min(p))
        p = p / max(p)
        evolved_env[i] = p.copy()

    # av_mod_diff = np.arctanh(av_mod_diff)
    met_prof = get_evolved_met_prof(evolved_env, dm, transporter)

    return transporter, met_prof
Example #18
def main(family, quantile_ass=.99):
    data_folder = os.path.join(Path(os.getcwd()).parents[1], 'data')
    #load a pickle generated from "associate_env.py script"
    store = pickle.load(open(data_folder + '/pickles/' + family + '.pkl',
                             'rb'))

    used_environment = store['used_env'].copy()

    full_freq_m = store['full_freq_m'].copy()

    reactome = store['reactome'].copy()
    model_sample = store['model_sample'].copy()
    transporter = store['transporter'].copy()

    #replace nan values by the average
    av_used_env = np.nanmean(used_environment, 0)
    inds = np.where(np.isnan(used_environment))
    used_environment[inds] = np.take(av_used_env, inds[1])

    #for reaction frequency
    av_freq_m = np.mean(full_freq_m, axis=0)
    diff_freq_m = full_freq_m - av_freq_m

    #filter out noise and find reactions that are driven by the environment
    env_d_score1 = np.round(np.max(diff_freq_m, axis=0), 4)
    env_d_score1 = env_d_score1 / max(np.abs(env_d_score1))
    env_d_score2 = np.round(np.min(diff_freq_m, axis=0), 4)
    env_d_score2 = env_d_score2 / max(np.abs(env_d_score2))
    env_d_score = np.zeros(len(env_d_score1))
    for i in range(len(env_d_score1)):
        if abs(env_d_score2[i]) > abs(env_d_score1[i]):
            env_d_score[i] = env_d_score2[i]
        else:
            env_d_score[i] = env_d_score1[i]

    m_diff_freq_m = np.abs(env_d_score)

    env_driven_reactome = reactome  #[m_diff_freq_m>.005]
    diff_freq_m_envd = diff_freq_m.T  #[m_diff_freq_m>.005].T
    reaction_frequency = full_freq_m.T  #[m_diff_freq_m>.005].T

    clss_freq_m = np.zeros(diff_freq_m_envd.shape)
    for i, v in enumerate(diff_freq_m_envd):
        clss_freq_m[i] = v  #assign_to_rank(v, fpc,fnc)

    #for the environment
    av_used_env = np.mean(used_environment, axis=0)

    diff_used_env = used_environment - av_used_env

    #filter out noise and find metabolites that are driven by the environment
    m_diff_used_env = np.max(np.abs(diff_used_env), axis=0)
    driving_mets = transporter  #[m_diff_used_env>0.005]
    diff_used_env_envd = diff_used_env.T  #[m_diff_used_env>0.005].T
    used_env = used_environment.T  #[m_diff_used_env>0.005].T
    clss_used_env = np.zeros(diff_used_env_envd.shape)

    for i, v in enumerate(diff_used_env_envd):
        clss_used_env[i] = v  #assign_to_rank(v, epc, enc)

    s_clss_fm = np.sum(np.abs(clss_freq_m), axis=0)
    s_clss_ue = np.sum(np.abs(clss_used_env), axis=0)

    #env_driven_reactome

    envd_reactions = env_driven_reactome[s_clss_fm != 0]

    #driving_metabolites
    dm = driving_mets.copy()
    dm = dm[s_clss_ue != 0]

    #profiles
    envd_prof = clss_freq_m.T[s_clss_fm != 0].T
    dm_prof = clss_used_env.T[s_clss_ue != 0].T

    #regression terms
    x = reaction_frequency.T[s_clss_fm != 0].T
    y = used_env.T[s_clss_ue != 0].T

    cosine_dict = {}

    for i, reac in enumerate(envd_prof.T):
        cosine_dict[envd_reactions[i]] = np.array(
            [cosine(reac.flatten(), metab.flatten()) for metab in dm_prof.T])

    cosine_pool = np.array(list(cosine_dict.values())).flatten()
    pc = np.quantile(cosine_pool[cosine_pool > 0], quantile_ass)
    nc = np.quantile(cosine_pool[cosine_pool < 0], 1 - quantile_ass)
    association_d = {}

    for i, reac in enumerate(envd_prof.T):
        v = cosine_dict[envd_reactions[i]]

        association_d[envd_reactions[i]] = assign_to_rank(v, pc, nc)

    g = build_association_network(association_d, envd_reactions, dm)
    nx.write_graphml(
        g,
        os.path.join(
            Path(os.getcwd()).parents[0], 'files', 'networks', family) +
        '.graphml')

    #find metabolite concentrations for models
    from sklearn.linear_model import MultiTaskElasticNetCV as EN
    enet = EN(cv=3, verbose=1, n_jobs=7, max_iter=10000)
    print(x.shape, y.shape)
    mod = enet.fit(x, y)
    evolved_env = np.zeros((len(model_sample), len(dm)))

    for i, mod_prof in enumerate(model_sample):
        print(family, i)
        v = mod_prof[m_diff_freq_m > .005]

        p = mod.predict(v[s_clss_fm != 0].reshape(1, -1))
        p = p.flatten()
        p = p + abs(min(p))
        p = p / max(p)
        evolved_env[i] = p.copy()

    #av_mod_diff = np.arctanh(av_mod_diff)
    met_prof = get_evolved_met_prof(evolved_env, dm, transporter)

    return transporter, met_prof
Example #19
def train_linear_model(X, y, random_state=1, test_size=0.2,
                       regularization_type='elasticnet', k_fold=5,
                       max_iter=1000000, tol=0.0001,
                       l1_ratio=None):
    """
    Function to train linear model with regularization and cross-validation.

    Args:
        X (pandas.DataFrame): dataframe of descriptors.
        y (pandas.DataFrame): dataframe of cycle lifetimes.
        random_state (int): seed for train/test split.
        test_size (float): proportion of the dataset reserved for model evaluation.
        regularization_type (str): lasso or ridge or elastic-net (with cv).
        k_fold (int): k in k-fold cross-validation.
        max_iter (int): maximum number of iterations for model fitting.
        tol (float): tolerance for optimization.
        l1_ratio ([float]): list of lasso to ridge ratios for elasticnet.

    Returns:
        sklearn.linear_model.LinearModel: fitted model.
        mu (float): Mean value of descriptors used in training.
        s (float): Std dev of descriptors used in training.

    """
    if l1_ratio is None:
        l1_ratio = [.1, .5, .7, .9, .95, 1]
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize (training) data after train/test split
    mu = np.mean(X_train, axis=0)
    s = np.std(X_train, axis=0)
    X_scaled = (X_train - mu) / s
    hyperparameters = {'random_state': random_state,
                       'test_size': test_size,
                       'k_fold': k_fold,
                       'tol': tol,
                       'max_iter': max_iter
                       }
    if regularization_type == 'lasso' and y.shape[1] == 1:
        lassocv = LassoCV(fit_intercept=True, alphas=None, tol=tol,
                          cv=k_fold, max_iter=max_iter)
        lassocv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = lassocv.alpha_
        linear_model = Lasso(fit_intercept=True, alpha=alpha_opt,
                             max_iter=max_iter)
        linear_model.fit(X_scaled, y_train.values)
        hyperparameters['l1_ratio'] = 1

    elif regularization_type == 'ridge' and y.shape[1] == 1:
        ridgecv = RidgeCV(fit_intercept=True, alphas=None, cv=k_fold)
        ridgecv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = ridgecv.alpha_
        linear_model = Ridge(fit_intercept=True, alpha=alpha_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = 0

    elif regularization_type == 'elasticnet' and y.shape[1] == 1:
        elasticnetcv = ElasticNetCV(fit_intercept=True, normalize=False,
                                    alphas=None, cv=k_fold,
                                    l1_ratio=l1_ratio, max_iter=max_iter)
        elasticnetcv.fit(X_scaled, y_train.values.ravel())

        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = elasticnetcv.alpha_
        l1_ratio_opt = elasticnetcv.l1_ratio_
        linear_model = ElasticNet(fit_intercept=True, normalize=False,
                                  l1_ratio=l1_ratio_opt,
                                  alpha=alpha_opt, max_iter=max_iter)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt

    # If more than 1 outcome present, perform multitask regression
    elif regularization_type == 'elasticnet' and y.shape[1] > 1:
        multi_elasticnet_CV = MultiTaskElasticNetCV(fit_intercept=True, cv=k_fold,
                                                    normalize=False,
                                                    l1_ratio=l1_ratio, max_iter=max_iter)
        multi_elasticnet_CV.fit(X_scaled, y_train)
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = multi_elasticnet_CV.alpha_
        l1_ratio_opt = multi_elasticnet_CV.l1_ratio_
        linear_model = MultiTaskElasticNet(fit_intercept=True, normalize=False,
                                           max_iter=max_iter)
        linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt
    else:
        raise NotImplementedError

    y_pred = linear_model.predict((X_test-mu)/s)
    Rsq = linear_model.score((X_test - mu) / s, y_test)
    # Compute 95% confidence interval
    # Multioutput = 'raw_values' provides prediction error per output
    pred_actual_ratio = [x/y for x, y in zip(y_pred, np.array(y_test))]
    relative_prediction_error = 1.96*np.sqrt(mean_squared_error(np.ones(y_pred.shape),
                                                                pred_actual_ratio,
                                                                multioutput='raw_values')/y_pred.shape[0])
    hyperparameters['alpha'] = alpha_opt
    return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
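A usage sketch that exercises the multitask branch (y with more than one column); as with other snippets on this page, scikit-learn >= 1.2 requires dropping the normalize arguments inside the function.
# Hedged usage sketch for train_linear_model with two targets.
import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.randn(200, 6))
y = pd.DataFrame(np.random.randn(200, 2))
linear_model, mu, s, rel_err, rsq, hp = train_linear_model(
    X, y, regularization_type='elasticnet')
print(hp['alpha'], hp['l1_ratio'])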
Example #20
p(mean_squared_error(lasso_predict, Y_test))

# ## Ridge
#

# In[25]:

ridge_model = Ridge(alpha=0.01)
ridge_model = ridge_model.fit(X=X_train, y=Y_train)

ridge_predict = ridge_model.predict(X_test)

p(mean_absolute_error(ridge_predict, Y_test))
p(mean_squared_error(ridge_predict, Y_test))

# ## Elastic Net

# In[27]:

enet_params = {
    'alpha': [1e-7],
}

enet_model = MultiTaskElasticNetCV(alphas=enet_params['alpha'])
enet_model = enet_model.fit(X=X_train, y=Y_train)

enet_predict = enet_model.predict(X_test)

p(mean_absolute_error(enet_predict, Y_test))
p(mean_squared_error(enet_predict, Y_test))
Example #21
lastX = np.zeros((X_raw.shape[0], hiddenSize))

for i in range(epochs // quanta):
    print('Epoch: ', i * quanta)
    an.trainSupervised(quanta, trndata,
        initialLearningrate=learningrate,
        decay=1,  # 0.999
        myWeightdecay=weightDecay,
        momentum=momentum)
    netTrainFs.append(an.scoreOnDS(trndata))
    X, X_test = an.transform(X_raw), an.transform(X_test_raw)
    if (lastX == X).all():
        raise RuntimeError('problem')
    lastX = copy.deepcopy(X)
    clf = MultiTaskElasticNetCV()
    clf.fit(X, Y)
    predTrain = np.array(clf.predict(X))
    splits = []
    for col in range(predTrain.shape[1]):
        bestSplit, bestF1 = labanUtil.getSplitThreshold(predTrain[:, col], Y[:, col])
        splits.append(bestSplit)
    pred = np.array(clf.predict(X_test))
    for col in range(pred.shape[1]):
        pred[:, col] = [1 if e>=splits[col] else 0 for e in pred[:, col]]
        predTrain[:, col] = [1 if e>=splits[col] else 0 for e in predTrain[:, col]]
    
    testFs.append(metrics.f1_score(Y_test, pred))
    trainFs.append(metrics.f1_score(Y, predTrain))
#des+='\n EN test f1: '+ str(testF)
#des+=' , EN train f1: '+ str(trainF)
r = range(epochs // quanta)
Example #22
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    tss, rss, ess, r2 = xss(Y, multiTaskElasticNet.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试MultiTaskElasticNetCV类**********"
    # 在初始化MultiTaskElasticNetCV类时, 提供一组备选的α值, MultiTaskElasticNetCV类会帮我们选择一个合适的α值.
    multiTaskElasticNetCV = MultiTaskElasticNetCV(
        alphas=[0.01, 0.1, 0.5, 1, 3, 5, 7, 10, 20, 100], cv=5)
    # 拟合训练集
    multiTaskElasticNetCV.fit(train_X, train_Y)
    # 打印最优的α值
    print "最优的alpha值: ", multiTaskElasticNetCV.alpha_
    # 打印模型的系数
    print "系数:", multiTaskElasticNetCV.coef_
    print "截距:", multiTaskElasticNetCV.intercept_
    print '训练集R2: ', r2_score(train_Y, multiTaskElasticNetCV.predict(train_X))

    # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者
    # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏.
    test_Y_pred = multiTaskElasticNetCV.predict(test_X)
    print "测试集得分:", multiTaskElasticNetCV.score(test_X, test_Y)
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)