def train(ticker):
    data = load_processed_data(ticker)
    data = data.loc[:"2020-12-31"]
    data = data.iloc[:-1]
    X_train, X_test, y_train, y_test = split(data)
    model = train_model(X_train, X_test, y_train, y_test, params)
    return model
def get_training_data(ticker):
    data = load_processed_data(ticker)
    data = data.loc[:"2020-12-31"]
    data = data.iloc[:-1]
    data = data[['CLOSE']]

    # Pre-process: first difference of the log close prices (i.e. log returns)
    log_data = np.log(data[['CLOSE']])
    log_data = np.array(log_data['CLOSE'])
    diff_log_data = np.diff(log_data)
    return diff_log_data
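# Minimal illustrative sketch (not from the source repo): the pre-processing above
# reduces to one-period log returns, diff(log(p_t)) = log(p_t / p_{t-1}).
import numpy as np

close = np.array([100.0, 101.0, 99.0])
log_returns = np.diff(np.log(close))   # approx [0.00995, -0.02000]
assert np.allclose(log_returns, np.log(close[1:] / close[:-1]))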
def get_training_data(ticker: str) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Loads the processed data corresponding to the input ticker.
    Removes any rows that are supposed to be used for testing (i.e. data from Jan 2021 onwards).
    Splits the training data into predictors and labels.
    """
    data = load_processed_data(ticker)
    data = data.loc[:"2020-12-31"]
    # Drop last row because it would have had 2021's first day's change in price
    data = data.iloc[:-1]
    predictors, labels = data.drop("LABELS", axis=1), data["LABELS"]
    return predictors, labels
def build_arima():
    futures_list = get_futures_list(filter_insignificant_lag_1_acf=True)

    for ticker in futures_list:
        print(f"{futures_list.index(ticker) + 1}/{len(futures_list)}: {ticker}")

        data = load_processed_data(ticker)
        data = data.loc[:"2020-12-31"]
        data = data.iloc[:-1]
        data = data[['CLOSE']]

        arima_model = pmdarima.auto_arima(data)
        arima_model = arima_model.fit(data)
        p, d, q = arima_model.order
        arima_residuals = arima_model.arima_res_.resid
        params = {"p": p, "q": q, "d": d, "residuals": list(arima_residuals)}

        # Save model
        dire = "./models/arima/param2/"
        os.makedirs(os.path.dirname(dire), exist_ok=True)
        with open(f'{dire}/{ticker}_params.txt', 'w') as f:
            json.dump(params, f, ensure_ascii=False)
        print(f"Saved parameters for {ticker}")
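# Hedged companion sketch: one way the parameter files written by build_arima() could be
# read back. The path and key names mirror the function above; rebuilding a pmdarima model
# from the saved (p, d, q) order is an assumption, and "ES" is a hypothetical ticker.
import json
import pmdarima

def load_arima_params(ticker, dire="./models/arima/param2/"):
    with open(f"{dire}/{ticker}_params.txt") as f:
        params = json.load(f)
    order = (params["p"], params["d"], params["q"])
    return order, params["residuals"]

# order, residuals = load_arima_params("ES")   # hypothetical ticker
# model = pmdarima.ARIMA(order=order)          # rebuild a model with the saved order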
def __init__(self, genre, level, data_type, batch_size, elmocache: ELMoCache,
             shuffle=True, return_data=False, return_features=False, return_label=True):
    """
    :param elmocache: instance of ELMoCache, used to generate elmo embeddings
    """
    self.input_data = load_processed_data(genre, level, data_type)
    self.input_premise = self.input_data['premise']
    self.input_hypothesis = self.input_data['hypothesis']
    self.input_label = self.input_data['label']
    assert self.input_premise.shape[0] == self.input_hypothesis.shape[0] == self.input_label.shape[0]
    self.data_size = self.input_hypothesis.shape[0]
    self.indexes = np.arange(self.data_size)
    self.batch_size = batch_size
    self.elmocache = elmocache
    self.shuffle = shuffle
    self.return_data = return_data            # whether to return original data
    self.return_features = return_features    # whether to return additional statistical features
    self.return_label = return_label          # whether to return label
    if self.return_features:
        self.features = load_features(genre, data_type)
from hyperopt import hp
from hyperopt.pyll.base import scope
from sklearn.model_selection import TimeSeriesSplit

from models.xgboost.training_util import train_model
from systems.systems_util import get_futures_list
from utils.data_loader import load_processed_data

# Cross validation
ts_crossval = TimeSeriesSplit(n_splits=5)

# Define search space for bayesian optimisation
XGB_param_hyperopt = {
    "booster": hp.choice("booster", ["gblinear"]),
    "max_depth": scope.int(hp.quniform("max_depth", 1, 5, 1)),
    "subsample": hp.uniform("subsample", 0.4, 0.6),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 0.6),
    "colsample_bynode": hp.uniform("colsample_bynode", 0.4, 0.6),
    "colsample_bylevel": hp.uniform("colsample_bylevel", 0.4, 0.6),
    "gamma": hp.uniform("gamma", 0, 10),
    "min_child_weight": hp.uniform("min_child_weight", 1.5, 2.3),
    "n_estimators": 100,
    "reg_lambda": hp.uniform("reg_lambda", 1, 8),
    "reg_alpha": hp.uniform("reg_alpha", 0, 0.02),
}

futures = get_futures_list(filter_insignificant_lag=1)

# Pre-train models and save the weights
for ticker in futures:
    data = load_processed_data(ticker)
    data = data.loc[:"2020-12-31"]
    data = data.iloc[:-1]
    train_model(data, XGB_param_hyperopt, ts_crossval, ticker)
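# Hedged sketch of how a hyperopt space like XGB_param_hyperopt is typically consumed.
# train_model's internals are not shown in this snippet, so the objective below
# (an XGBRegressor scored with time-series cross validation) is an assumption.
from hyperopt import fmin, tpe, Trials, space_eval
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

def tune(X, y, space, cv, max_evals=50):
    def objective(params):
        model = XGBRegressor(**params)
        # hyperopt minimises, so negate the mean CV score
        return -cross_val_score(model, X, y, cv=cv).mean()

    trials = Trials()
    best = fmin(fn=objective, space=space, algo=tpe.suggest,
                max_evals=max_evals, trials=trials)
    return space_eval(space, best)   # map hp.choice indices back to actual values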
def predict_dl_model(data_type, variation, input_level, word_embed_type, word_embed_trainable,
                     batch_size, learning_rate, optimizer_type, model_name, checkpoint_dir=None,
                     return_proba=True, **kwargs):
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {
            'word': config.aug_word_max_len,
            'char': config.aug_char_max_len
        }
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(
        format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                        variation=variation, type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)

    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
    config.exp_name = '{}_{}_{}_{}_{}'.format(
        variation, model_name, input_level, word_embed_type,
        'tune' if word_embed_trainable else 'fix')

    print('Logging Info - Experiment: ', config.exp_name)
    if model_name == 'bilstm':
        model = BiLSTM(config, **kwargs)
    elif model_name == 'cnnrnn':
        model = CNNRNN(config, **kwargs)
    elif model_name == 'dcnn':
        model = DCNN(config, **kwargs)
    elif model_name == 'dpcnn':
        model = DPCNN(config, **kwargs)
    elif model_name == 'han':
        model = HAN(config, **kwargs)
    elif model_name == 'multicnn':
        model = MultiTextCNN(config, **kwargs)
    elif model_name == 'rcnn':
        model = RCNN(config, **kwargs)
    elif model_name == 'rnncnn':
        model = RNNCNN(config, **kwargs)
    elif model_name == 'cnn':
        model = TextCNN(config, **kwargs)
    elif model_name == 'vdcnn':
        model = VDCNN(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood : {}'.format(model_name))

    model_save_path = path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path):
        raise FileNotFoundError('Model Not Found: {}'.format(model_save_path))
    # load the best model
    model.load_best_model()

    data = load_processed_data(variation, input_level, data_type)
    if data is None:
        return None, config.exp_name

    if return_proba:
        return model.predict_proba(data), config.exp_name
    else:
        return model.predict(data), config.exp_name
for variation in VARIATIONS:
    model_dev_pred_probas = []
    model_dev_pred_classes = []
    model_test_pred_probas = []
    model_test_pred_classes = []

    dl_model_names = ['bilstm']
    ml_model_names = ['mnb']
    bilstm_index, mnb_index = -1, -1
    for idx, name in enumerate(dl_model_names + ml_model_names):
        if name == 'bilstm':
            bilstm_index = idx
        elif name == 'mnb':
            mnb_index = idx
    fallback = mnb_index if mnb_index != -1 else bilstm_index

    dev_data_label = load_processed_data(variation, 'word', 'dev')['label']

    ensemble_log = {'ensemble_models': [], 'binary_threshold': binary_threshold}

    for dl_model_name in dl_model_names:
        dev_pred_proba, exp_name = predict_dl_model('dev', variation, 'word', 'w2v_data', True,
                                                    64, 0.001, 'adam', dl_model_name,
def train_dl_model(variation, input_level, word_embed_type, word_embed_trainable, batch_size,
                   learning_rate, optimizer_type, model_name, binary_threshold=0.5,
                   checkpoint_dir=None, overwrite=False, log_error=False, save_log=True, **kwargs):
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {
            'word': config.aug_word_max_len,
            'char': config.aug_char_max_len
        }
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(
        format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                        variation=variation, type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.binary_threshold = binary_threshold

    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
    if not os.path.exists(config.checkpoint_dir):
        os.makedirs(config.checkpoint_dir)
    config.exp_name = '{}_{}_{}_{}_{}'.format(
        variation, model_name, input_level, word_embed_type,
        'tune' if word_embed_trainable else 'fix')

    train_log = {
        'exp_name': config.exp_name,
        'batch_size': batch_size,
        'optimizer': optimizer_type,
        'learning_rate': learning_rate,
        'binary_threshold': binary_threshold
    }

    print('Logging Info - Experiment: ', config.exp_name)
    if model_name == 'bilstm':
        model = BiLSTM(config, **kwargs)
    elif model_name == 'cnnrnn':
        model = CNNRNN(config, **kwargs)
    elif model_name == 'dcnn':
        model = DCNN(config, **kwargs)
    elif model_name == 'dpcnn':
        model = DPCNN(config, **kwargs)
    elif model_name == 'han':
        model = HAN(config, **kwargs)
    elif model_name == 'multicnn':
        model = MultiTextCNN(config, **kwargs)
    elif model_name == 'rcnn':
        model = RCNN(config, **kwargs)
    elif model_name == 'rnncnn':
        model = RNNCNN(config, **kwargs)
    elif model_name == 'cnn':
        model = TextCNN(config, **kwargs)
    elif model_name == 'vdcnn':
        model = VDCNN(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood : {}'.format(model_name))

    train_input = load_processed_data(variation, input_level, 'train')
    dev_input = load_processed_data(variation, input_level, 'dev')
    test_input = load_processed_data(variation, input_level, 'test')

    model_save_path = path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_input, dev_input)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s',
              time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

    # load the best model
    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r = model.evaluate(dev_input)
    train_log['valid_acc'] = valid_acc
    train_log['valid_f1'] = valid_f1
    train_log['valid_macro_f1'] = valid_macro_f1
    train_log['valid_p'] = valid_p
    train_log['valid_r'] = valid_r
    train_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    if log_error:
        error_indexes, error_pred_probas = model.error_analyze(dev_input)
        dev_text_input = load_processed_text_data(variation, 'dev')
        for error_index, error_pred_prob in zip(error_indexes, error_pred_probas):
            train_log['error_%d' % error_index] = '{},{},{},{}'.format(
                error_index, dev_text_input['sentence'][error_index],
                dev_text_input['label'][error_index], error_pred_prob)

    if save_log:
        write_log(format_filename(LOG_DIR, PERFORMANCE_LOG_TEMPLATE, variation=variation),
                  log=train_log, mode='a')

    if test_input is not None:
        test_predictions = model.predict(test_input)
        writer_predict(format_filename(PREDICT_DIR, config.exp_name + '.labels'), test_predictions)

    return valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r
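# Hedged usage sketch for train_dl_model. The argument values below are illustrative only:
# 'word', 'w2v_data', batch size 64, learning rate 0.001 and 'adam' are borrowed from the
# predict_dl_model call above, while the variation name 'dialect' is hypothetical.
valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r = train_dl_model(
    variation='dialect', input_level='word', word_embed_type='w2v_data',
    word_embed_trainable=True, batch_size=64, learning_rate=0.001,
    optimizer_type='adam', model_name='bilstm', overwrite=False)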
def train_match_model(variation, input_level, word_embed_type, word_embed_trainable, batch_size,
                      learning_rate, optimizer_type, encoder_type='concat_attention',
                      metrics='euclidean', checkpoint_dir=None, overwrite=False):
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {
            'word': config.aug_word_max_len,
            'char': config.aug_char_max_len
        }
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(
        format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                        variation=variation, type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)

    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
    if not os.path.exists(config.checkpoint_dir):
        os.makedirs(config.checkpoint_dir)
    config.exp_name = '{}_dialect_match_{}_{}_{}_{}_{}'.format(
        variation, encoder_type, metrics, input_level, word_embed_type,
        'tune' if word_embed_trainable else 'fix')
    config.checkpoint_monitor = 'val_loss'
    config.early_stopping_monitor = 'val_loss'

    train_log = {
        'exp_name': config.exp_name,
        'batch_size': batch_size,
        'optimizer': optimizer_type,
        'learning_rate': learning_rate
    }

    # pass the encoder/metric arguments through instead of hard-coding them
    model = DialectMatchModel(config, encoder_type=encoder_type, metrics=metrics)

    train_input = load_processed_data(variation, input_level, 'train')
    dev_input = load_processed_data(variation, input_level, 'dev')

    model_save_path = path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_input, dev_input)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s',
              time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

    # load the best model
    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    valid_acc, valid_f1 = model.evaluate(dev_input)
    train_log['valid_acc'] = valid_acc
    train_log['valid_f1'] = valid_f1
    train_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG_TEMPLATE, variation=variation + '_match'),
              log=train_log, mode='a')
    return valid_acc, valid_f1