Пример #1
0
def create_ml_features_df(data_frame):
    """Build the ticker-independent feature columns used for machine learning.

    The frame is passed twice through ``create_days_since_valid_date`` (a
    helper defined elsewhere): once in its incoming order for the
    'Days Since Trading' column, and once sorted by descending index for the
    'Days Until Trading' column. The index is then turned into a 'Date'
    column, expanded with ``add_datepart`` (keeping the original 'Date'
    values, ``drop=False``), and restored as the index.

    Args:
        data_frame: pandas DataFrame to derive features from. Its index is
            expected to be named 'Date', since the index is reset and
            'Date' is then used as a column.

    Returns:
        The feature-augmented DataFrame, indexed by 'Date'.

    Raises:
        AssertionError: If data_frame is not a pandas DataFrame.
    """
    assert isinstance(data_frame, pd.DataFrame), (
        "data_frame must be pandas.DataFrame object")

    features = create_days_since_valid_date(data_frame, 'Days Since Trading')

    # "Days until" is "days since" computed on the reversed timeline: flip
    # the index order, reuse the helper, then restore ascending order.
    features.sort_index(ascending=False, inplace=True)
    features = create_days_since_valid_date(features, 'Days Until Trading')
    features.sort_index(ascending=True, inplace=True)

    # add_datepart works on a column, so expose the index as 'Date' while the
    # date-part columns (month, day, etc.) are added, then index by it again.
    features = features.reset_index()
    add_datepart(features, 'Date', drop=False)
    return features.set_index('Date')
Пример #2
0
def divide_dataframe(df, test_size=0.4):
    '''
    Split a dataset into ordered training and test partitions.

    The 'Date' column is expanded with fastai's add_datepart, the frame is
    preprocessed with proc_df (target column 'Actions', scaling enabled),
    and the result is split without shuffling, so the training partition is
    the leading part of the data and the test partition is the trailing part.

    NOTE: add_datepart is called directly on ``df``, so the caller's frame is
    modified by this function.

    :param df: dataset we want to split into training and test data; must
        contain a 'Date' column and an 'Actions' target column
    :param test_size: passed through to train_test_split as ``test_size``
        (defaults to 0.4, the value previously hard-coded)
    :return: dict with the keys 'X train', 'X test', 'y train' and 'y test'
    '''
    # Converting date into int with fastai
    add_datepart(df, 'Date')

    # Preprocess with fastai; the NA dictionary and the scaling mapper that
    # proc_df also returns are not needed here.
    X, y, _nas, _mapper = proc_df(df, y_fld='Actions', do_scale=True)

    # shuffle=False keeps the row order so the split stays chronological.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        shuffle=False)
    return {
        'X train': X_train,
        'X test': X_test,
        'y train': y_train,
        'y test': y_test
    }
Пример #3
0
# Order the rows by index label; the positional train/validation split below
# relies on this ordering.
data = df.sort_index(ascending=True, axis=0)

#creating a separate dataset
# Take the two needed columns in one step and renumber the rows 0..n-1.
# The previous per-row copy used chained assignment
# (new_data['Date'][i] = ...), which pandas warns about and which has no
# effect when Copy-on-Write is enabled.
new_data = data[['Date', 'Close']].reset_index(drop=True)

print(new_data)

#create features
from fastai.structured import  add_datepart
add_datepart(new_data, 'Date')
new_data.drop('Elapsed', axis=1, inplace=True)  #elapsed will be the time stamp
print(new_data)

# Flag rows whose Dayofweek is 0 or 4 (Monday/Friday, assuming the pandas
# dayofweek convention where Monday is 0); every other row gets 0.
new_data['mon_fri'] = new_data['Dayofweek'].isin([0, 4]).astype('int64')

#split into train and validation
# Positional split: the first 987 rows train, the remainder validates.
train = new_data[:987]
valid = new_data[987:]

x_train = train.drop('Close', axis=1)
Пример #4
0
import pandas as pd
import numpy as np
from fastai.structured import add_datepart
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import db_functions

# Scale every feature into the [0, 1] range (as observed on the training set).
scaler = MinMaxScaler(feature_range=(0, 1))

# Load the dataset and replace the 'Date' column with date-part features.
data = db_functions.getDb()
add_datepart(data, 'Date')
data.drop('Elapsed', axis=1, inplace=True)

# Positional split: the first 987 rows train, the remainder validates.
train = data[:987]
valid = data[987:]

x_train = train.drop('Close', axis=1)
y_train = train['Close']
x_valid = valid.drop('Close', axis=1)
y_valid = valid['Close']

# Fit the scaler on the training features only and reuse those statistics
# for the validation features. Refitting on the validation set would scale
# the two partitions differently and leak validation statistics into the
# preprocessing. Validation values may therefore fall outside [0, 1].
x_train_scaled = scaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train_scaled)
x_valid_scaled = scaler.transform(x_valid)
x_valid = pd.DataFrame(x_valid_scaled)

# Grid-search the number of neighbours for a KNN regressor with 5-fold CV.
params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
knn = neighbors.KNeighborsRegressor()
model = GridSearchCV(knn, params, cv=5)
newdf.head(5)


# In[ ]:


# Apart from this, we can add our own set of features that we believe would
# be relevant for the predictions. For instance, my hypothesis is that the
# first and last days of the week could affect the closing price of the stock
# far more than the other days. So I have created a feature that identifies
# whether a given day is Monday/Friday or Tuesday/Wednesday/Thursday. This can
# be done using the following lines of code:


# In[ ]:


#create features
from fastai.structured import  add_datepart
add_datepart(newdf, 'Date')
newdf.drop('Elapsed', axis=1, inplace=True)  #elapsed will be the time stamp


# In[ ]:


#create a new column [mon-fri]: 1 when the day is Monday or Friday, else 0
import sys
# Vectorized flag on Dayofweek 0 or 4 (Monday/Friday, assuming the pandas
# dayofweek convention where Monday is 0). The previous per-row loop used
# chained assignment (newdf['mon-fri'][i] = ...), which pandas warns about
# and which has no effect when Copy-on-Write is enabled.
newdf['mon-fri'] = newdf['Dayofweek'].isin([0, 4]).astype('int64')