Example #1
def train(ticker):
    # Keep only the training period (data up to end of 2020)
    data = load_processed_data(ticker)
    data = data.loc[:"2020-12-31"]
    # Drop the last row: its label would contain 2021's first day's change in price
    data = data.iloc[:-1]
    # `split` and `params` are defined elsewhere in the module
    X_train, X_test, y_train, y_test = split(data)
    model = train_model(X_train, X_test, y_train, y_test, params)
    return model
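The helpers `split` and the module-level `params` are not shown here. As a minimal sketch of what `split` might look like (the function body and the 80/20 chronological cut are assumptions; the "LABELS" column name follows Example #3 below):

import pandas as pd

def split(data: pd.DataFrame, test_fraction: float = 0.2):
    """Chronological split so the test rows stay strictly in the future."""
    cutoff = int(len(data) * (1 - test_fraction))  # assumed 80/20 cut
    train, test = data.iloc[:cutoff], data.iloc[cutoff:]
    # "LABELS" as the target column follows Example #3
    return (train.drop("LABELS", axis=1), test.drop("LABELS", axis=1),
            train["LABELS"], test["LABELS"])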
Example #2
def get_training_data(ticker):
    data = load_processed_data(ticker)
    data = data.loc[:"2020-12-31"]
    data = data.iloc[:-1]
    data = data[['CLOSE']]
    # Pre-process: log-difference the close prices to obtain log returns
    log_data = np.log(data['CLOSE'].to_numpy())
    diff_log_data = np.diff(log_data)

    return diff_log_data
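np.diff of the log prices yields one-step log returns, since log(P_t) - log(P_{t-1}) = log(P_t / P_{t-1}). A quick self-contained check (the prices are made up):

import numpy as np

prices = np.array([100.0, 101.0, 99.0])   # hypothetical closing prices
log_returns = np.diff(np.log(prices))     # the same transform as above
assert np.allclose(log_returns, np.log(prices[1:] / prices[:-1]))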
Example #3
def get_training_data(ticker: str) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Loads the processed data corresponding to the input ticker.
    Removes any rows that are supposed to be used for testing (i.e. data from Jan 2021 onwards).
    Splits the training data into predictors and labels.
    """
    data = load_processed_data(ticker)
    data = data.loc[:"2020-12-31"]

    # Drop last row because it would have had 2021's first day's change in price
    data = data.iloc[:-1]

    predictors, labels = data.drop("LABELS", axis=1), data["LABELS"]
    return predictors, labels
Example #4
import json
import os

import pmdarima

from systems.systems_util import get_futures_list
from utils.data_loader import load_processed_data


def build_arima():
    futures_list = get_futures_list(filter_insignificant_lag_1_acf=True)
    for i, ticker in enumerate(futures_list, start=1):
        print(f"{i}/{len(futures_list)}: {ticker}")
        data = load_processed_data(ticker)
        data = data.loc[:"2020-12-31"]
        data = data.iloc[:-1]
        data = data[['CLOSE']]
        # auto_arima selects and fits the model in one step; no separate fit() needed
        arima_model = pmdarima.auto_arima(data)
        p, d, q = arima_model.order
        arima_residuals = arima_model.arima_res_.resid

        # tolist() converts numpy scalars to plain floats so json can serialise them
        params = {"p": p, "d": d, "q": q, "residuals": arima_residuals.tolist()}

        # Save model parameters
        dire = "./models/arima/param2/"
        os.makedirs(dire, exist_ok=True)
        with open(os.path.join(dire, f"{ticker}_params.txt"), 'w') as f:
            json.dump(params, f, ensure_ascii=False)
            print(f"Saved parameters for {ticker}")
Example #5
    def __init__(self, genre, level, data_type, batch_size, elmocache: ELMoCache, shuffle=True, return_data=False,
                 return_features=False, return_label=True):
        """
        :param elmocache: instance of ELMoCache, used to generate the ELMo embeddings
        """
        self.input_data = load_processed_data(genre, level, data_type)
        self.input_premise = self.input_data['premise']
        self.input_hypothesis = self.input_data['hypothesis']
        self.input_label = self.input_data['label']
        assert self.input_premise.shape[0] == self.input_hypothesis.shape[0] == self.input_label.shape[0]
        self.data_size = self.input_hypothesis.shape[0]
        self.indexes = np.arange(self.data_size)

        self.batch_size = batch_size
        self.elmocache = elmocache
        self.shuffle = shuffle
        self.return_data = return_data      # whether to return original data
        self.return_features = return_features  # whether to return additional statistical features
        self.return_label = return_label    # whether to return label

        if self.return_features:
            self.features = load_features(genre, data_type)
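Only __init__ is shown, but the class reads like a Keras-style batch generator. A minimal sketch of the companion methods (the method names follow the keras.utils.Sequence convention, and the ELMoCache embedding call is hypothetical; neither is confirmed by the snippet):

    def __len__(self):
        # Number of full batches per epoch
        return self.data_size // self.batch_size

    def __getitem__(self, batch_idx):
        # Slice one batch of (possibly shuffled) indices
        idx = self.indexes[batch_idx * self.batch_size:(batch_idx + 1) * self.batch_size]
        premise = self.elmocache.embed(self.input_premise[idx])        # hypothetical ELMoCache API
        hypothesis = self.elmocache.embed(self.input_hypothesis[idx])  # hypothetical ELMoCache API
        return [premise, hypothesis], self.input_label[idx]

    def on_epoch_end(self):
        # Reshuffle between epochs when requested
        if self.shuffle:
            np.random.shuffle(self.indexes)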
Example #6
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import TimeSeriesSplit

from models.xgboost.training_util import train_model
from systems.systems_util import get_futures_list
from utils.data_loader import load_processed_data

# Cross validation
ts_crossval = TimeSeriesSplit(n_splits=5)

# Define the search space for Bayesian optimisation
XGB_param_hyperopt = {
    "booster": hp.choice("booster", ["gblinear"]),
    "max_depth": scope.int(hp.quniform("max_depth", 1, 5, 1)),
    "subsample": hp.uniform("subsample", 0.4, 0.6),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 0.6),
    "colsample_bynode": hp.uniform("colsample_bynode", 0.4, 0.6),
    "colsample_bylevel": hp.uniform("colsample_bylevel", 0.4, 0.6),
    "gamma": hp.uniform("gamma", 0, 10),
    "min_child_weight": hp.uniform("min_child_weight", 1.5, 2.3),
    "n_estimators": 100,
    "reg_lambda": hp.uniform("reg_lambda", 1, 8),
    "reg_alpha": hp.uniform("reg_alpha", 0, 0.02),
}

futures = get_futures_list(filter_insignificant_lag=1)

# Pre-train models and save the weights
for ticker in futures:
    data = load_processed_data(ticker)
    data = data.loc[:"2020-12-31"]
    data = data.iloc[:-1]
    train_model(data, XGB_param_hyperopt, ts_crossval, ticker)
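train_model is imported from the repository and not shown. A minimal sketch of how such a trainer might wire the space into hyperopt (the regressor, the MSE objective, the fmin settings, and the function name are all assumptions; only the search space and the TimeSeriesSplit come from the code above):

import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, tpe
from sklearn.metrics import mean_squared_error

def tune_xgb(X, y, space, cv, max_evals=50):
    def objective(params):
        # Average MSE across the time-series folds for one hyperparameter draw
        scores = []
        for train_idx, val_idx in cv.split(X):
            model = xgb.XGBRegressor(**params)
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
            preds = model.predict(X.iloc[val_idx])
            scores.append(mean_squared_error(y.iloc[val_idx], preds))
        return {'loss': sum(scores) / len(scores), 'status': STATUS_OK}

    # TPE is hyperopt's Bayesian-style sequential optimiser
    return fmin(fn=objective, space=space, algo=tpe.suggest,
                max_evals=max_evals, trials=Trials())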
Example #7
def predict_dl_model(data_type,
                     variation,
                     input_level,
                     word_embed_type,
                     word_embed_trainable,
                     batch_size,
                     learning_rate,
                     optimizer_type,
                     model_name,
                     checkpoint_dir=None,
                     return_proba=True,
                     **kwargs):
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {
            'word': config.aug_word_max_len,
            'char': config.aug_char_max_len
        }
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        variation=variation,
                        type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
    config.exp_name = '{}_{}_{}_{}_{}'.format(
        variation, model_name, input_level, word_embed_type,
        'tune' if word_embed_trainable else 'fix')

    print('Logging Info - Experiment: ', config.exp_name)
    if model_name == 'bilstm':
        model = BiLSTM(config, **kwargs)
    elif model_name == 'cnnrnn':
        model = CNNRNN(config, **kwargs)
    elif model_name == 'dcnn':
        model = DCNN(config, **kwargs)
    elif model_name == 'dpcnn':
        model = DPCNN(config, **kwargs)
    elif model_name == 'han':
        model = HAN(config, **kwargs)
    elif model_name == 'multicnn':
        model = MultiTextCNN(config, **kwargs)
    elif model_name == 'rcnn':
        model = RCNN(config, **kwargs)
    elif model_name == 'rnncnn':
        model = RNNCNN(config, **kwargs)
    elif model_name == 'cnn':
        model = TextCNN(config, **kwargs)
    elif model_name == 'vdcnn':
        model = VDCNN(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood: {}'.format(model_name))

    model_save_path = path.join(config.checkpoint_dir,
                                '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path):
        raise FileNotFoundError('Model Not Found: {}'.format(model_save_path))
    # load the best model
    model.load_best_model()

    data = load_processed_data(variation, input_level, data_type)

    if data is None:
        return None, config.exp_name

    if return_proba:
        return model.predict_proba(data), config.exp_name
    else:
        return model.predict(data), config.exp_name
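A hedged usage sketch, mirroring the call pattern in Example #8 below (the variation name is made up; the remaining argument values follow Example #8):

# 'dialect_x' is an illustrative variation name
proba, exp_name = predict_dl_model('dev', 'dialect_x', 'word', 'w2v_data',
                                   True, 64, 0.001, 'adam', 'bilstm')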
Example #8
        for variation in VARIATIONS:
            model_dev_pred_probas = []
            model_dev_pred_classes = []
            model_test_pred_probas = []
            model_test_pred_classes = []
            dl_model_names = ['bilstm']
            ml_model_names = ['mnb']
            bilstm_index, mnb_index = -1, -1
            for idx, name in enumerate(dl_model_names + ml_model_names):
                if name == 'bilstm':
                    bilstm_index = idx
                elif name == 'mnb':
                    mnb_index = idx
            fallback = mnb_index if mnb_index != -1 else bilstm_index

            dev_data_label = load_processed_data(variation, 'word',
                                                 'dev')['label']
            ensemble_log = {
                'ensemble_models': [],
                'binary_threshold': binary_threshold
            }

            for dl_model_name in dl_model_names:
                dev_pred_proba, exp_name = predict_dl_model('dev',
                                                            variation,
                                                            'word',
                                                            'w2v_data',
                                                            True,
                                                            64,
                                                            0.001,
                                                            'adam',
                                                            dl_model_name,
Example #9
def train_dl_model(variation,
                   input_level,
                   word_embed_type,
                   word_embed_trainable,
                   batch_size,
                   learning_rate,
                   optimizer_type,
                   model_name,
                   binary_threshold=0.5,
                   checkpoint_dir=None,
                   overwrite=False,
                   log_error=False,
                   save_log=True,
                   **kwargs):
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {
            'word': config.aug_word_max_len,
            'char': config.aug_char_max_len
        }
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        variation=variation,
                        type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.binary_threshold = binary_threshold
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
        if not os.path.exists(config.checkpoint_dir):
            os.makedirs(config.checkpoint_dir)
    config.exp_name = '{}_{}_{}_{}_{}'.format(
        variation, model_name, input_level, word_embed_type,
        'tune' if word_embed_trainable else 'fix')

    train_log = {
        'exp_name': config.exp_name,
        'batch_size': batch_size,
        'optimizer': optimizer_type,
        'learning_rate': learning_rate,
        'binary_threshold': binary_threshold
    }

    print('Logging Info - Experiment: ', config.exp_name)
    if model_name == 'bilstm':
        model = BiLSTM(config, **kwargs)
    elif model_name == 'cnnrnn':
        model = CNNRNN(config, **kwargs)
    elif model_name == 'dcnn':
        model = DCNN(config, **kwargs)
    elif model_name == 'dpcnn':
        model = DPCNN(config, **kwargs)
    elif model_name == 'han':
        model = HAN(config, **kwargs)
    elif model_name == 'multicnn':
        model = MultiTextCNN(config, **kwargs)
    elif model_name == 'rcnn':
        model = RCNN(config, **kwargs)
    elif model_name == 'rnncnn':
        model = RNNCNN(config, **kwargs)
    elif model_name == 'cnn':
        model = TextCNN(config, **kwargs)
    elif model_name == 'vdcnn':
        model = VDCNN(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood: {}'.format(model_name))

    train_input = load_processed_data(variation, input_level, 'train')
    dev_input = load_processed_data(variation, input_level, 'dev')
    test_input = load_processed_data(variation, input_level, 'test')

    model_save_path = path.join(config.checkpoint_dir,
                                '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_input, dev_input)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' %
              time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S",
                                                time.gmtime(elapsed_time))

    # load the best model
    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r = model.evaluate(
        dev_input)
    train_log['valid_acc'] = valid_acc
    train_log['valid_f1'] = valid_f1
    train_log['valid_macro_f1'] = valid_macro_f1
    train_log['valid_p'] = valid_p
    train_log['valid_r'] = valid_r
    train_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())

    if log_error:
        error_indexes, error_pred_probas = model.error_analyze(dev_input)
        dev_text_input = load_processed_text_data(variation, 'dev')
        for error_index, error_pred_prob in zip(error_indexes,
                                                error_pred_probas):
            train_log['error_%d' % error_index] = '{},{},{},{}'.format(
                error_index, dev_text_input['sentence'][error_index],
                dev_text_input['label'][error_index], error_pred_prob)
    if save_log:
        write_log(format_filename(LOG_DIR,
                                  PERFORMANCE_LOG_TEMPLATE,
                                  variation=variation),
                  log=train_log,
                  mode='a')

    if test_input is not None:
        test_predictions = model.predict(test_input)
        writer_predict(
            format_filename(PREDICT_DIR, config.exp_name + '.labels'),
            test_predictions)

    return valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r
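A hedged usage sketch (the variation name is made up; batch size, learning rate, optimiser, and embedding type mirror Example #8):

# 'dialect_x' is an illustrative variation name
valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r = train_dl_model(
    'dialect_x', 'word', 'w2v_data', True,
    batch_size=64, learning_rate=0.001,
    optimizer_type='adam', model_name='bilstm')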
Example #10
def train_match_model(variation,
                      input_level,
                      word_embed_type,
                      word_embed_trainable,
                      batch_size,
                      learning_rate,
                      optimizer_type,
                      encoder_type='concat_attention',
                      metrics='euclidean',
                      checkpoint_dir=None,
                      overwrite=False):
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {
            'word': config.aug_word_max_len,
            'char': config.aug_char_max_len
        }
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        variation=variation,
                        type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
        if not os.path.exists(config.checkpoint_dir):
            os.makedirs(config.checkpoint_dir)
    config.exp_name = '{}_dialect_match_{}_{}_{}_{}_{}'.format(
        variation, encoder_type, metrics, input_level, word_embed_type,
        'tune' if word_embed_trainable else 'fix')
    config.checkpoint_monitor = 'val_loss'
    config.early_stopping_monitor = 'val_loss'
    train_log = {
        'exp_name': config.exp_name,
        'batch_size': batch_size,
        'optimizer': optimizer_type,
        'learning_rate': learning_rate
    }

    model = DialectMatchModel(config,
                              encoder_type=encoder_type,
                              metrics=metrics)
    train_input = load_processed_data(variation, input_level, 'train')
    dev_input = load_processed_data(variation, input_level, 'dev')

    model_save_path = path.join(config.checkpoint_dir,
                                '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_input, dev_input)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' %
              time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S",
                                                time.gmtime(elapsed_time))

    # load the best model
    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    valid_acc, valid_f1 = model.evaluate(dev_input)
    train_log['valid_acc'] = valid_acc
    train_log['valid_f1'] = valid_f1
    train_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())

    write_log(format_filename(LOG_DIR,
                              PERFORMANCE_LOG_TEMPLATE,
                              variation=variation + '_match'),
              log=train_log,
              mode='a')
    return valid_acc, valid_f1
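A hedged usage sketch (the variation name is made up; the encoder and metric values are the defaults from the signature above, and the remaining hyperparameters mirror Example #8):

# 'dialect_x' is an illustrative variation name
valid_acc, valid_f1 = train_match_model('dialect_x', 'word', 'w2v_data', True,
                                        batch_size=64, learning_rate=0.001,
                                        optimizer_type='adam',
                                        encoder_type='concat_attention',
                                        metrics='euclidean')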