Python split_data示例，tools.split_data Python示例

示例#1

0

显示文件

文件： lstm.py 项目： carlosejimenez/dbg

    def load_and_split_data(self, x, y, test_ratio, purge_ratio, units, epochs,
                            batch_size):
        """Split ``x``/``y``, scale the training part, then build and fit the LSTM.

        The split itself is delegated to ``tools.split_data``.  The training
        targets and features are fitted/transformed with ``self.scale_y`` and
        ``self.scale_x``; the features are then reshaped to
        ``(-1, self.time_steps, self.input_size)``.  The test part is handed
        back exactly as ``tools.split_data`` produced it (it is neither
        scaled nor reshaped here).

        :param x: feature data passed to ``tools.split_data``.
        :param y: target data passed to ``tools.split_data``.
        :param test_ratio: forwarded to ``tools.split_data``.
        :param purge_ratio: forwarded to ``tools.split_data``.
        :param units: number of units of every LSTM layer.
        :param epochs: forwarded to ``self.model.fit``.
        :param batch_size: forwarded to ``self.model.fit``.
        :return: the tuple ``(x_test, y_test)``.
        """
        split = tools.split_data(
            x, y, test_ratio=test_ratio, purge_ratio=purge_ratio)
        self.x_train, x_test, self.y_train, y_test = split

        # Entries of ``y`` from index len(y_train) up to len(y) - len(y_test).
        purge_start = len(self.y_train)
        purge_stop = len(y) - len(y_test)
        self.y_purge = y[purge_start:purge_stop]

        train_targets = np.array(self.y_train).reshape(-1, self.output_size)
        self.y_train = self.scale_y.fit_transform(train_targets)
        self.x_train = np.reshape(
            self.scale_x.fit_transform(self.x_train),
            (-1, self.time_steps, self.input_size))

        # Four LSTM layers, each followed by 20% dropout, then a dense output
        # layer.  Only the last LSTM layer does not return sequences.
        stack = (
            LSTM(units=units,
                 return_sequences=True,
                 input_shape=(self.time_steps, self.input_size)),
            Dropout(0.2),
            LSTM(units=units, return_sequences=True),
            Dropout(0.2),
            LSTM(units=units, return_sequences=True),
            Dropout(0.2),
            LSTM(units=units),
            Dropout(0.2),
            Dense(units=self.output_size),
        )
        for layer in stack:
            self.model.add(layer)

        self.model.compile(optimizer='adam', loss='mean_squared_error')
        self.model.fit(self.x_train,
                       self.y_train,
                       epochs=epochs,
                       batch_size=batch_size)

        return x_test, y_test

示例#2

0

显示文件

文件： model_selection.py 项目： yingdongo/Kaggle-Otto-Group-Product-Classification

def main():
    """Score the candidate classifiers on ``train.csv`` and plot the scores.

    The ``target`` column is label-encoded in place, every column except
    ``target`` and ``id`` is used as a feature, and the result of
    ``clf_score`` is printed and plotted with its index as x tick labels.
    """
    train = load_data('train.csv')
    lbl_enc = preprocessing.LabelEncoder()
    train['target'] = lbl_enc.fit_transform(train['target'])
    feature_cols = [col for col in train.columns if col not in ['target', 'id']]
    X_train, y_train = split_data(train, feature_cols)
    clf_scores = clf_score(create_clf(), X_train[feature_cols], y_train)
    # The parenthesised single-argument form prints the same text on
    # Python 2 and is valid syntax on Python 3.
    print(clf_scores)
    plt.plot(clf_scores)
    plt.xticks(range(len(clf_scores)), clf_scores.index, fontsize=14, rotation=90)
    plt.show()

示例#3

0

显示文件

文件： models.py 项目： paveldedik/thesis

    def split_data(cls, data, ratio=0.7):
        """Split data into a training set and a test set.

        ``data`` is first reduced with :func:`tools.first_answers`; the
        result is then handed to :func:`tools.split_data`, whose return
        value is returned unchanged.

        :param data: The object containing data.
        :type data: :class:`pandas.DataFrame`
        :param ratio: What portion of data to include in the training set
            and the test set. :obj:`0.5` means that the data will be
            distributed equally.
        :type ratio: float
        """
        first_only = tools.first_answers(data)
        split = tools.split_data(first_only, ratio=ratio)
        return split

示例#4

0

显示文件

文件： experiment.py 项目： drewcapener/depression-screening

def get_data(path,
             training_percent=0.7,
             percent_zeroes=0.5,
             n_nonzero_repeat=0,
             shuffle=True,
             normalize=False):
    """Extract the data at ``path`` and return it as train/test parts.

    ``extract_data`` is called with ``path`` and ``normalize``; its result is
    split by ``split_data`` using ``training_percent``, ``percent_zeroes``
    and ``n_nonzero_repeat``.  When ``shuffle`` is truthy the training pair
    and the test pair are each passed through ``shuffle_data``.

    Returns the tuple ``(X_train, y_train, X_test, y_test)``.
    """
    features, labels = extract_data(path, normalize)
    split_options = dict(training_percent=training_percent,
                         percent_zeroes=percent_zeroes,
                         n_nonzero_repeat=n_nonzero_repeat)
    X_train, y_train, X_test, y_test = split_data(features, labels,
                                                  **split_options)
    if not shuffle:
        return X_train, y_train, X_test, y_test

    # Train and test parts are shuffled independently of each other.
    X_train, y_train = shuffle_data(X_train, y_train)
    X_test, y_test = shuffle_data(X_test, y_test)
    return X_train, y_train, X_test, y_test

示例#5

0

显示文件

文件： Decision Trees-Continuous-Variables.py 项目： Jilani7/Decision-Tree


# In[149]:


# %pdb
# NOTE(review): `feat` and `dt` are used here but only assigned further down
# in this fragment -- presumably they come from an earlier notebook cell.
print (" Plotting the Decision Surface of Training Set... ")
t.plot_decision_regions(X[:,feat],Y,clf=dt, res=0.1, cycle_marker=True, legend=1)


# In[150]:


# Split your data into training and test-set...
# see the documentation of split_data in tools for further information...
Xtrain,Ytrain,Xtest,Ytest=t.split_data(X,Y)

print (" Training Data Set Dimensions=", Xtrain.shape, "Training True Class labels dimensions", Ytrain.shape)
# Report the shape of the *test* labels here (the training labels were
# printed a second time before).
print (" Test Data Set Dimensions=", Xtest.shape, "Test True Class labels dimensions", Ytest.shape)


# In[151]:


# Lets train a Decision Tree Classifier on Petal Length and Width
feat=[0,1]
dt=DecisionTree(0.95,5)
dt.train(Xtrain[:,feat],Ytrain)


# In[152]:

示例#6

0

显示文件

文件： parameter_tuning_continue.py 项目： yingdongo/Kaggle-Otto-Group-Product-Classification

def main():
    """Load ``train.csv`` and grid-search the classifiers from ``get_clfs``.

    Every column except ``target`` and ``id`` is treated as a feature.
    """
    train = load_data('train.csv')
    excluded = ['target', 'id']
    feature_cols = [name for name in train.columns if name not in excluded]
    X_train, y_train = split_data(train, feature_cols)
    features = X_train[feature_cols]
    grid_search(features, y_train, get_clfs())

示例#7

0

显示文件

文件： parameter_tuning_continue.py 项目： yingdongo/Kaggle-Otto-Group-Product-Classification

            param_list = list(ParameterGrid(param_grid))
            for i in range(0,len(param_list)):
                   reg=clfs[name]['est'].set_params(**param_list[i])
                   cv=cv_score1(reg,X_train,y)
                   print [cv.mean(),name,param_list[i]]

def grid_search1(X_train, y, clfs):
    """Run a 5-fold ``GridSearchCV`` for every entry of ``clfs`` and print results.

    For each entry the score of the refitted search on the training data,
    the best cross-validation score and the best parameter set are printed.

    :param X_train: training features passed to ``GridSearchCV.fit``.
    :param y: training targets passed to ``GridSearchCV.fit``.
    :param clfs: mapping whose values are dicts holding the estimator under
        ``'est'`` and its parameter grid under ``'grid'``.
    """
    # ``values()`` exists on Python 2 and 3 (``iteritems()`` is Python 2
    # only); the keys were never used inside the loop.
    for spec in clfs.values():
        search = GridSearchCV(spec['est'], spec['grid'], n_jobs=16, verbose=1, cv=5)
        search.fit(X_train, y)
        # Call ``score``; printing the bare attribute only showed the bound
        # method's repr, not a number.
        print(search.score(X_train, y))
        print(search.best_score_)
        print(search.best_params_)

def main():
    """Run ``grid_search`` on ``train.csv`` with the classifiers of ``get_clfs``.

    The feature columns are all columns other than ``target`` and ``id``.
    """
    train = load_data('train.csv')
    non_features = ['target', 'id']
    feature_cols = [c for c in train.columns if c not in non_features]
    X_train, y_train = split_data(train, feature_cols)
    selected = X_train[feature_cols]
    grid_search(selected, y_train, get_clfs())

#if __name__ == '__main__':
#    main()
# Module-level experiment: grid-search the estimators from get_extra_trees()
# on a random half of the rows of train.csv (sampled without replacement).
le = preprocessing.LabelEncoder()
data=load_data('train.csv')
# np.around returns a float, but np.random.choice needs an integer ``size``;
# cast explicitly so the sampling does not fail with a TypeError.
train=data.loc[np.random.choice(data.index,int(np.around(len(data)*0.5)), replace=False)]
le.fit(train['target'])
train['target']=le.transform(train['target'])
feature_cols= [col for col in train.columns if col  not in ['target','id']]
X_train,y_train=split_data(train,feature_cols)
grid_search(X_train[feature_cols],y_train,get_extra_trees())

示例#8

0

显示文件

文件： analysis.py 项目： carlosejimenez/dbg

    # Passed to tools.split_data below as its third positional argument.
    percentage_to_evaluate = 0.05

    # Settings forwarded to tools.make_return_df / tools.build_x_y / RidgeCV.
    interval_length = 1
    window_length = 10
    ema_hyper_parameter = 0.2
    k_fold_hyperparameter = 6

    # NOTE(review): `first_day` and `yesterday` are not defined in the visible
    # part of this function -- presumably set earlier; confirm.
    my_df = tools.make_return_df('BMW',
                                 first_day,
                                 yesterday.strftime('%Y-%m-%d'),
                                 interval=interval_length)
    graph_returns(my_df)

    x, y = tools.build_x_y(my_df, window_length, ema_hyper_parameter)

    # x and y are rebound to the first and third results of the split; the
    # evaluation_set_* parts are not used again in the visible code.
    x, evaluation_set_x, y, evaluation_set_y = tools.split_data(
        x, y, percentage_to_evaluate)

    # NOTE(review): neither alpha grid is used in the visible code; RidgeCV
    # below is given its own np.arange(0.01, 1, 0.01) instead of alphas_ridge.
    alphas_ridge = np.arange(0, 1, 0.01)
    alphas_lasso = np.arange(0, 100, 1)

    # Cross-validated ridge regression without intercept, using
    # k_fold_hyperparameter as the `cv` argument.
    clf = RidgeCV(np.arange(0.01, 1, 0.01),
                  cv=k_fold_hyperparameter,
                  fit_intercept=False)
    clf.fit(x, y)
    print(f'coefficient is {clf.coef_}')


def perform_arma():
    """Fit an ARMA model of order ``(10, 5)`` to ``my_df['Return']``.

    ``my_df`` is not a parameter; it is looked up outside this function.
    The fitted result is neither stored nor returned.
    """
    # ARMA testing
    arma_order = (10, 5)
    sm.tsa.ARMA(my_df['Return'], arma_order).fit()

示例#9

0

显示文件

    def train_with_lstm(self):
        """Train an LSTM on ``self.data``, plot the fit and report RMSE scores.

        Reads ``lookback``, ``input_dim``, ``hidden_dim``, ``output_dim``,
        ``num_layers``, ``lr`` and ``num_epochs`` from ``self.input_params``
        and uses ``self.scaler.inverse_transform`` to map model outputs and
        targets back before scoring and plotting.  Two figures are shown: a
        matplotlib/seaborn plot of the training fit and a plotly figure with
        the train prediction, the test prediction and the actual values.

        :return: list ``[train_rmse, test_rmse, training_time]`` where the
            RMSE values are computed on the first column of the
            inverse-transformed arrays and ``training_time`` is the
            wall-clock duration of the training loop in seconds.
        """
        lookback = self.input_params['lookback']
        x_train, y_train, x_test, y_test = split_data(self.data, lookback)
        print('x_train.shape = ', x_train.shape)
        print('y_train.shape = ', y_train.shape)
        print('x_test.shape = ', x_test.shape)
        print('y_test.shape = ', y_test.shape)
        x_train = torch.from_numpy(x_train).type(torch.Tensor)
        x_test = torch.from_numpy(x_test).type(torch.Tensor)
        y_train_lstm = torch.from_numpy(y_train).type(torch.Tensor)
        y_test_lstm = torch.from_numpy(y_test).type(torch.Tensor)

        model = LSTM(input_dim=self.input_params['input_dim'],
                     hidden_dim=self.input_params['hidden_dim'],
                     output_dim=self.input_params['output_dim'],
                     num_layers=self.input_params['num_layers'])

        criterion = torch.nn.MSELoss(reduction='mean')
        # Adam: a first-order optimisation algorithm that can replace classic
        # stochastic gradient descent; it iteratively updates the network
        # weights based on the training data.
        optimiser = torch.optim.Adam(model.parameters(),
                                     lr=self.input_params['lr'])

        # Per-epoch loss; only consumed by the disabled training-loss plot
        # further down.
        hist = np.zeros(self.input_params['num_epochs'])
        start_time = time.time()

        # Training loop: every epoch runs the model on the whole training set.
        for t in range(self.input_params['num_epochs']):
            y_train_pred = model(x_train)

            loss = criterion(y_train_pred, y_train_lstm)
            print("Epoch ", t, "MSE: ", loss.item())
            hist[t] = loss.item()

            # Reset the parameter gradients to zero.
            optimiser.zero_grad()
            # Back-propagate to compute the gradients.
            loss.backward()
            # Update all parameters.
            optimiser.step()

        training_time = time.time() - start_time
        print("Training time: {}".format(training_time))

        # Convert the scaled values back to the original scale.
        predict = pd.DataFrame(
            self.scaler.inverse_transform(y_train_pred.detach().numpy()))
        original = pd.DataFrame(
            self.scaler.inverse_transform(y_train_lstm.detach().numpy()))

        print(predict)
        fig = plt.figure()
        # Adjust the subplot layout.
        fig.subplots_adjust(hspace=0.2, wspace=0.2)

        # Stock price (first cell of a 1x2 grid; the second cell belongs to
        # the disabled training-loss plot below).
        plt.subplot(1, 2, 1)
        ax = sns.lineplot(x=original.index,
                          y=original[0],
                          label="Data",
                          color='royalblue')
        ax = sns.lineplot(x=predict.index,
                          y=predict[0],
                          label="Training Prediction (LSTM)",
                          color='tomato')
        ax.set_title('Stock price', size=14, fontweight='bold')
        ax.set_xlabel("Days", size=14)
        ax.set_ylabel("Cost (USD)", size=14)
        ax.set_xticklabels('', size=10)
        plt.show()

        # # Training loss
        # plt.subplot(1, 2, 2)
        # print(hist)
        # ax = sns.lineplot(data=hist, color='royalblue')
        # ax.set_xlabel("Epoch", size=14)
        # ax.set_ylabel("Loss", size=14)
        # ax.set_title("Training Loss", size=14, fontweight='bold')
        # fig.set_figheight(6)
        # fig.set_figwidth(16)
        # fig.show()

        # > Prediction
        # make predictions
        y_test_pred = model(x_test)

        # invert predictions: scaler.inverse_transform maps the scaled values
        # back to the original scale
        y_train_pred = self.scaler.inverse_transform(
            y_train_pred.detach().numpy())
        y_train = self.scaler.inverse_transform(y_train_lstm.detach().numpy())
        y_test_pred = self.scaler.inverse_transform(
            y_test_pred.detach().numpy())
        y_test = self.scaler.inverse_transform(y_test_lstm.detach().numpy())

        # Root of the mean squared error on the first column.
        train_score = math.sqrt(
            mean_squared_error(y_train[:, 0], y_train_pred[:, 0]))
        print('Train Score: %.2f RMSE' % (train_score))
        test_score = math.sqrt(
            mean_squared_error(y_test[:, 0], y_test_pred[:, 0]))
        print('Test Score: %.2f RMSE' % (test_score))
        lstm = [train_score, test_score, training_time]

        # > train and test
        # np.empty_like creates an array with the same shape and type as
        # self.data; it is filled with NaN so that only the assigned rows
        # carry values.
        train_predict_plot = np.empty_like(self.data)
        train_predict_plot[:, :] = np.nan
        train_predict_plot[lookback:len(y_train_pred) + lookback, :] = \
            y_train_pred

        # shift test predictions for plotting
        test_predict_plot = np.empty_like(self.data)
        test_predict_plot[:, :] = np.nan
        test_predict_plot[len(y_train_pred) + lookback - 1:
                          len(self.data) - 1, :] = y_test_pred

        actual = self.scaler.inverse_transform(
            self.data['Close'].values.reshape(-1, 1))

        # Columns of `result`: 0 = train prediction, 1 = test prediction,
        # 2 = actual value.
        predictions = np.append(train_predict_plot, test_predict_plot, axis=1)
        predictions = np.append(predictions, actual, axis=1)
        result = pd.DataFrame(predictions)

        # >> fig
        # Each trace is built with a single go.Scatter call; wrapping a
        # Scatter inside another Scatter only copied it.
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(x=result.index,
                       y=result[0],
                       mode='lines',
                       name='Train prediction'))
        fig.add_trace(
            go.Scatter(x=result.index,
                       y=result[1],
                       mode='lines',
                       name='Test prediction'))
        fig.add_trace(
            go.Scatter(x=result.index,
                       y=result[2],
                       mode='lines',
                       name='Actual Value'))
        fig.update_layout(xaxis=dict(showline=True,
                                     showgrid=True,
                                     showticklabels=False,
                                     linecolor='white',
                                     linewidth=2),
                          yaxis=dict(
                              title_text='Close (USD)',
                              # `title_font` (i.e. yaxis.title.font) replaces
                              # the deprecated `titlefont` attribute.
                              title_font=dict(
                                  family='Rockwell',
                                  size=12,
                                  color='white',
                              ),
                              showline=True,
                              showgrid=True,
                              showticklabels=True,
                              linecolor='white',
                              linewidth=2,
                              ticks='outside',
                              tickfont=dict(
                                  family='Rockwell',
                                  size=12,
                                  color='white',
                              ),
                          ),
                          showlegend=True,
                          template='plotly_dark')

        annotations = [
            dict(xref='paper',
                 yref='paper',
                 x=0.0,
                 y=1.05,
                 xanchor='left',
                 yanchor='bottom',
                 text='Results (LSTM)',
                 font=dict(family='Rockwell', size=26, color='white'),
                 showarrow=False)
        ]
        fig.update_layout(annotations=annotations)

        fig.show()
        #   py.iplot(fig, filename='stock_prediction_lstm')
        return lstm