def day_offset_sampler(window=1):
    """Assuming observations from dataset are (x, y), compute the density of
    the y's over the next n days.

    Args:
        window (int): how many days to use in the density

    Yield:
        [(x₁, y₁), (x₂, y₂), ...]: where ∀xᵢ are part of the same day,
        and y is a density (mean of the Ordinal y vectors) over the next
        {window} days
    """
    # buffer is a deque of "day groups": each element is a list of the
    # (x, y) observations that share the same date index.
    buffer = deque()
    sampler = dataset_sampler(x_format='OneHot', y_format='Ordinal')

    def add_day(day):
        # NOTE: despite the name, `day` is a single (x, y) observation.
        # indexed: [newest day][first observation][first element of train pair][date index]
        if len(buffer) and buffer[-1][0][0][0] == day[0][0]:
            # Same date as the newest group: extend that group.
            buffer[-1].append(day)
        else:
            # New date: start a fresh group.
            buffer.append([day])

    # Prime the buffer until it holds window + 2 day groups: the day to
    # yield, the `window` days whose y's are averaged, and one extra
    # (possibly partial) trailing day.
    while len(buffer) <= window + 1:
        add_day(next(sampler))
    for observation in sampler:
        add_day(observation)
        if len(buffer) > window + 1:
            current_day = buffer.popleft()
            # Temporarily drop the newest group so the mean covers only the
            # `window` complete days after current_day.
            temp = buffer.pop()  # the last day only has one element outside the window
            # Element-wise mean of every y across the remaining window days.
            expected = np.mean([j[1] for i in buffer for j in i], axis=0)
            buffer.append(temp)
            # Pair each x of current_day with the window-averaged density.
            yield [(record[0], expected) for record in current_day]
def run_simple():
    """Train the simple torch model, then evaluate it on the test split."""
    from models.torch_simple.train import train_simple, test_simple

    def make_sampler():
        # Fresh dataset iterator per call, one-hot x's and ordinal y's.
        return dataset_sampler(x_format='OneHot', y_format='Ordinal')

    hyperparams = {'input_size': 25, 'output_size': 4}
    train_simple(make_sampler, hyperparams)
    evaluate(*test_simple(make_sampler, hyperparams))
def run_ann():
    """Train the torch feed-forward ANN, then evaluate it on the test split."""
    from models.torch_ann.train import train_ann, test_ann

    def make_sampler():
        # Fresh dataset iterator per call, one-hot x's and ordinal y's.
        return dataset_sampler(x_format='OneHot', y_format='Ordinal')

    hyperparams = {
        'input_size': 25,
        'output_size': 4,
        'layer_sizes': (100, 20),
    }
    train_ann(make_sampler, hyperparams)
    evaluate(*test_ann(make_sampler, hyperparams))
def day_sampler():
    """Assuming observations from dataset are (x, y), group them by day.

    Yields:
        list of (x, y): all consecutive observations sharing the same date
        index (x[0]), one list per day, in stream order.
    """
    sampler = dataset_sampler(x_format='OneHot', y_format='Ordinal')
    # Seed the current day group with the first observation.
    buffer = [next(sampler)]
    for observation in sampler:
        # indexed: [first observation in group][x of the pair][date index]
        if buffer[0][0][0] != observation[0][0]:
            # Date changed: emit the finished day, start a new group.
            yield buffer
            buffer = [observation]
        else:
            # BUGFIX: the original appended unconditionally after
            # `buffer = [observation]`, duplicating the first observation
            # of every new day; append only when the day is unchanged.
            buffer.append(observation)
    # Flush the final (still-open) day group.
    yield buffer
def run_lstm():
    """Train the torch LSTM model, then evaluate it on the test split."""
    from models.torch_lstm.train import train_lstm, test_lstm

    def make_sampler():
        # Fresh dataset iterator per call, one-hot x's and ordinal y's.
        return dataset_sampler(x_format='OneHot', y_format='Ordinal')

    hyperparams = dict(
        input_size=25,
        output_size=4,
        lstm_hidden_dim=20,
        lstm_layers=2,
        batch_size=1,
    )
    train_lstm(make_sampler, hyperparams)
    evaluate(*test_lstm(make_sampler, hyperparams))
from sklearn.svm import SVC from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import BaggingClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import GradientBoostingClassifier model_specifications = [{ "name": "Decision Tree", "class": DecisionTreeClassifier, "datasource": lambda: dataset_sampler(x_format='OneHot', y_format='Ordinal'), "hyperparameters": { "criterion": ["gini", "entropy"], "splitter": ["best", "random"], "min_samples_split": [2, 3], "min_samples_leaf": [1, 5] } }, { "name": "Neural Network PCA 10pc", "class": MLPClassifier, "datasource": lambda: dataset_sampler( x_format='OneHot', y_format='Ordinal', components=10), "hyperparameters": {
def run_keras():
    """Train the keras ANN, then run its test routine."""
    from models.keras_ann.network import train_keras, test_keras

    def make_sampler():
        # Fresh dataset iterator per call; both x and y one-hot encoded here.
        return dataset_sampler(x_format='OneHot', y_format='OneHot')

    fit_params = {'epochs': 200, 'batch_size': 10}
    train_keras(make_sampler, fit_params)
    test_keras(make_sampler)