示例#1
0
def agg_data(n_apt, seed):
    """Pickle the mean energy series of a random sample of apartments.

    Apartment ids 1..114 are shuffled with the ``random`` module seeded by
    ``seed``; the first ``n_apt`` ids are loaded through ``load_energy`` and
    placed side by side in a DataFrame (one column per apartment). The
    row-wise mean of that frame is pickled to
    ``DATA_SET_DIR + 'Mean_seed_<seed>_apt_<n_apt>_2016.pkl'``.

    Args:
        n_apt: Number of apartments to sample. Only 114 ids exist, so a
            larger value simply selects all of them.
        seed: Seed passed to ``random.seed`` so the sample is reproducible.

    Returns:
        None. The result is written to disk.
    """
    random.seed(seed)
    # random.shuffle() needs a mutable sequence. On Python 3 a bare range()
    # is immutable and shuffle raises TypeError, so materialise it first.
    apts = list(range(1, 115))
    random.shuffle(apts)
    apts = apts[:n_apt]

    print('num of apts:', len(apts))
    agg_energy = {}
    for apt in apts:
        print('reading %d ...' % apt)
        # presumably a time-indexed pandas Series per apartment -- confirm
        # against load_energy.
        agg_energy[apt] = load_energy(apt)

    df = pd.DataFrame(agg_energy)

    # Vectorised row-wise mean (NaNs skipped, as with np.mean on each row).
    # The Series keeps the name 'mean' so the pickled object is unchanged.
    mean_series = df.mean(axis=1).rename('mean')

    filename = DATA_SET_DIR + 'Mean_seed_%d_apt_%d_2016.pkl' % (seed, n_apt)
    print('saving to file: %s ...' % filename)
    mean_series.to_pickle(filename)
    print('saved.')
示例#2
0
def agg_all_sum(freqs):
    """Pickle the summed energy of all apartments at several frequencies.

    Loads apartments 1..114 through ``load_energy`` into one DataFrame
    (one column per apartment). For each entry of ``freqs`` the frame is
    resampled with a mean, restricted to the dates produced by
    ``pd.date_range('2016-01-01', '2016-12-01', freq=freq)``, summed across
    apartments, and pickled to
    ``DATA_SET_DIR + 'SUM_<n_apartments>_<freq>_2016.pkl'``.

    Args:
        freqs: Iterable of pandas frequency strings to resample to.

    Returns:
        None. One pickle file is written per frequency.
    """
    apartment_ids = range(1, 115)
    print('# apartments:', len(apartment_ids))
    print('freqs:', freqs)

    def _read(apartment_id):
        # Announce each read before it starts, then load the data.
        print('reading %d ...' % apartment_id)
        return load_energy(apartment_id)

    readings = {apartment_id: _read(apartment_id)
                for apartment_id in apartment_ids}
    frame = pd.DataFrame(readings)

    def _row_total(row):
        return np.sum(row)

    for freq in freqs:
        print('freq:', freq)
        resampled = frame.resample(freq).mean()
        wanted_index = pd.date_range(start='2016-01-01',
                                     end='2016-12-01',
                                     freq=freq)
        resampled = resampled.loc[wanted_index]

        # Add the total as a column, then keep only that column.
        resampled['sum'] = resampled.apply(_row_total, axis=1)
        totals = resampled['sum']

        out_path = DATA_SET_DIR + 'SUM_%d_%s_2016.pkl' % (
            len(apartment_ids), freq)
        print('saving to file: %s ...' % out_path)
        totals.to_pickle(out_path)
        print('saved.')
def visualize_error_lasso_alpha():
    """Plot the cross-validated MSE of a LassoCV fit against -log10(alpha).

    Fits ``linear_model.LassoCV(cv=20)`` on the energy dataset's ``Y1``
    target, timing construction plus fit. Draws one dotted curve per fold
    from ``mse_path_``, a solid black curve for the mean across folds, and
    a dashed vertical line at the selected ``alpha_``. The figure is shown
    with ``plt.show()``.
    """
    energy = load_energy()

    t0 = time.time()
    lasso = linear_model.LassoCV(cv=20)
    lasso.fit(energy.data, energy.target('Y1'))
    elapsed = time.time() - t0

    # Work on a -log10 scale for the x axis.
    neg_log_path = -np.log10(lasso.alphas_)
    fold_errors = lasso.mse_path_

    plt.figure()
    plt.plot(neg_log_path, fold_errors, ':')
    plt.plot(
        neg_log_path,
        fold_errors.mean(axis=-1),
        'k',
        label='Average across the folds',
        linewidth=2,
    )
    plt.axvline(
        -np.log10(lasso.alpha_),
        linestyle='--',
        color='k',
        label='alpha: CV estimate',
    )

    plt.legend()

    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    heading = ('Mean square error on each fold: coordinate descent '
               '(train time: %.2fs)' % elapsed)
    plt.title(heading)
    plt.axis('tight')

    plt.show()
def visualize_error_ridge_alpha(n_alphas=200, n_folds=12):
    """Plot ridge-regression test error across a grid of alpha values.

    For each of ``n_alphas`` values log-spaced between 1e-10 and 1e-2, a
    ``Ridge(fit_intercept=False)`` model is fitted on ``n_folds`` shuffled
    train/test splits (20% test) of the energy dataset's ``Y1`` target. The
    test mean squared errors fill a ``(n_alphas, n_folds)`` array, which is
    printed together with the alpha grid and plotted with one dotted line
    per fold.

    Args:
        n_alphas: Number of alpha values in the grid.
        n_folds: Number of shuffle-split iterations per alpha.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    dataset = load_energy()
    alphas = np.logspace(-10, -2, n_alphas)
    model = linear_model.Ridge(fit_intercept=False)
    # One random seed per call, reused for every alpha below.
    seed = random.randint(1, 10000)
    X = dataset.data
    y = dataset.target('Y1')

    errors = np.zeros(shape=(n_alphas, n_folds))
    for idx, alpha in enumerate(alphas):
        model.set_params(alpha=alpha)
        # NOTE(review): rebuilt with the same seed for each alpha, presumably
        # so every alpha is scored on the same splits -- confirm against the
        # ShuffleSplit version in use.
        splits = ShuffleSplit(len(y), n_iter=n_folds, test_size=0.2,
                              random_state=seed)

        for jdx, (train, test) in enumerate(splits):
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]

            model.fit(X_train, y_train)
            error = mean_squared_error(y_test, model.predict(X_test))

            errors[idx, jdx] = error

    # print() with a single argument behaves the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print(errors)
    print(errors.shape)
    print(alphas)
    print(alphas.shape)

    plt.figure()
    plt.plot(alphas, errors, ':')

    plt.show()
示例#5
0
def build(args):
    """
    Train and pickle one ridge regression pipeline per target column.

    For each of the targets 'Y1' and 'Y2' this:

        - scores a RidgeCV over 200 log-spaced alphas (1e-10 .. 1e-2)
          with ``cvs(..., cv=12)``
        - refits the RidgeCV on the full dataset to obtain ``alpha_``
        - fits an Imputer + Ridge(alpha_) pipeline on the full dataset
        - pickles that pipeline to HEAT_MODEL ('Y1') or COLD_MODEL ('Y2')
        - prints a short summary

    ``args`` is accepted but not read by this function. Returns a string
    reporting how long the build took.
    """
    started = time.time()

    # Load data and the grid of candidate alphas
    energy = load_energy()
    alpha_grid = np.logspace(-10, -2, 200)

    # Where the fitted pipeline for each target is written
    model_paths = {
        'Y1': HEAT_MODEL,
        'Y2': COLD_MODEL,
    }

    summary = ("%s trained on %i instances using a %s model\n"
               "     average R2 score of %0.3f using an alpha of %0.5f\n"
               "     model has been dumped to %s\n")

    cv_scores = {}
    for target in ('Y1', 'Y2'):
        # Cross validate on the raw data; imputation is left out here
        searcher = linear_model.RidgeCV(alphas=alpha_grid)
        cv_scores[target] = cvs(
            searcher, energy.data, energy.target(target), cv=12
        )

        # Refit on everything so the chosen alpha is available
        searcher.fit(energy.data, energy.target(target))
        best_alpha = searcher.alpha_

        # Final estimator: mean imputation followed by a plain ridge
        ridge = linear_model.Ridge(alpha=best_alpha)
        fill_missing = Imputer(missing_values="NaN", strategy="mean", axis=0)
        pipeline = Pipeline([("imputer", fill_missing), ("ridge", ridge)])
        pipeline.fit(energy.data, energy.target(target))

        # Persist the fitted pipeline
        out_path = model_paths[target]
        with open(out_path, 'wb') as handle:
            pickle.dump(pipeline, handle, protocol=pickle.HIGHEST_PROTOCOL)

        details = (
            target,
            len(energy.data),
            ridge.__class__.__name__,
            cv_scores[target].mean(),
            best_alpha,
            out_path,
        )
        print(summary % details)

    elapsed = time.time() - started
    return "Build took %0.3f seconds" % elapsed
def build(args):
    """
    Fit the 'Y1' and 'Y2' ridge models and write them out as pickles.

    Steps per target:

        - cross validate a RidgeCV (200 alphas, 1e-10 to 1e-2) via
          ``cvs`` with cv=12
        - fit the RidgeCV on all the data and read off ``alpha_``
        - fit a Pipeline of Imputer (mean) and Ridge with that alpha
        - dump the pipeline to HEAT_MODEL for 'Y1', COLD_MODEL for 'Y2'
        - print a summary of the run

    The ``args`` parameter is not used by this function. The return value
    is a string with the elapsed build time in seconds.
    """
    t_begin = time.time()

    # Data set and candidate regularisation strengths
    data_bundle = load_energy()
    candidate_alphas = np.logspace(-10, -2, 200)

    # Target column paired with the file its model is dumped to
    outputs = (
        ('Y1', HEAT_MODEL),
        ('Y2', COLD_MODEL),
    )

    fold_scores = {}
    for column, destination in outputs:
        # Score the search itself; no imputation at this stage
        ridge_cv = linear_model.RidgeCV(alphas=candidate_alphas)
        fold_scores[column] = cvs(
            ridge_cv, data_bundle.data, data_bundle.target(column), cv=12
        )

        # A full-data fit yields the alpha that gets pickled
        ridge_cv.fit(data_bundle.data, data_bundle.target(column))

        # Build the imputing pipeline around a ridge with that alpha
        final_ridge = linear_model.Ridge(alpha=ridge_cv.alpha_)
        steps = [
            ("imputer", Imputer(missing_values="NaN", strategy="mean",
                                axis=0)),
            ("ridge", final_ridge),
        ]
        fitted = Pipeline(steps)
        fitted.fit(data_bundle.data, data_bundle.target(column))

        # Write the model to disk
        with open(destination, 'wb') as out_file:
            pickle.dump(fitted, out_file, protocol=pickle.HIGHEST_PROTOCOL)

        template = (
            "%s trained on %i instances using a %s model\n"
            "     average R2 score of %0.3f using an alpha of %0.5f\n"
            "     model has been dumped to %s\n"
        )
        mean_score = fold_scores[column].mean()
        print(template % (
            column, len(data_bundle.data), type(final_ridge).__name__,
            mean_score, ridge_cv.alpha_,
            destination,
        ))

    return "Build took %0.3f seconds" % (time.time() - t_begin)