def get_symbol_features(index, sym):
    data = index[sym]
    features = pd.read_csv(data['csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
    # Replace nan with infinity so that it can later be imputed to a finite value
    features = features.replace([np.inf, -np.inf], np.nan)

    # Derive target classes from closing price
    target_pct = target_price_variation(features['close'])
    target = target_binned_price_variation(target_pct, n_bins=2)
    return features, target
예제 #2
0
def main():
    index = load_dataset('all_merged', return_index=True)
    for _sym, data in index.items():

        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
        # Replace nan with infinity so that it can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)

        # Derive target classes from closing price
        target_pct = target_price_variation(features['close'])
        target = target_binned_price_variation(target_pct, n_bins=2)
        # target = target_discrete_price_variation(target_pct)

        print("--- end ---")
def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/svc_hyperparameters.json'
    estFile = './data/datasets/all_merged/estimators/svc_{}.p'
    hyperparameters = {}
    for _sym, data in index.items():
        features = pd.read_csv(data['csv'],
                               sep=',',
                               encoding='utf-8',
                               index_col='Date',
                               parse_dates=True)
        # Replace nan with infinity so that it can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)
        # Derive target classes from closing price
        target_pct = target_price_variation(features['close'])
        target = target_binned_price_variation(target_pct, n_bins=2)
        # target = target_discrete_price_variation(target_pct)

        # Split data in train and blind test set with 70:30 ratio,
        #  most ML models don't take sequentiality into account, but our pipeline
        #  uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
        X_train, X_test, y_train, y_test = train_test_split(features.values,
                                                            target.values,
                                                            shuffle=False,
                                                            test_size=0.3)
        # Summarize distribution
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)
        if not np.isfinite(X_train).all():
            logger.warning("Training x is not finite!")
        if not np.isfinite(y_train).all():
            logger.warning("Training y is not finite!")
        if not np.isfinite(X_test).all():
            logger.warning("Test x is not finite!")
        if not np.isfinite(y_test).all():
            logger.warning("Test y is not finite!")
        # Build pipeline to be used as estimator in bagging classifier
        #  so that each subset of the data is transformed independently
        #  to avoid contamination between folds.
        pipeline = Pipeline([
            (
                'i', SimpleImputer()
            ),  # Replace nan's with the median value between previous and next observation
            (
                's', RobustScaler()
            ),  # Scale data in order to center it and increase robustness against noise and outliers
            # ('k', SelectKBest()), # Select top 10 best features
            # ('u', RandomUnderSampler()),
            ('c', SVC()),
        ])

        # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
        logger.info("Start Grid search")
        CV_rfc = GridSearchCV(estimator=pipeline,
                              param_grid=SVC_PARAM_GRID,
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              verbose=1)
        CV_rfc.fit(X_train, y_train)
        logger.info("End Grid search")

        # Take the fitted ensemble with tuned hyperparameters
        clf = CV_rfc.best_estimator_

        # Test ensemble's performance on training and test sets
        logger.info("Classification report on train set")
        predictions1 = clf.predict(X_train)
        print(classification_report(y_train, predictions1))
        logger.info("Classification report on test set")
        predictions2 = clf.predict(X_test)
        print(classification_report(y_test, predictions2))
        stats = {
            'score': accuracy_score(y_train, predictions1),
            'mse': mean_squared_error(y_train, predictions1),
            'test_score': accuracy_score(y_test, predictions2),
            'test_mse': mean_squared_error(y_test, predictions2),
            'cv_best_mse': -1 * CV_rfc.best_score_,  # CV score is negated MSE
            # 'cv_results': CV_rfc.cv_results_,
            'cv_bestparams': CV_rfc.best_params_,
        }
        print(stats)
        with open(estFile.format(_sym), 'wb') as f:
            pickle.dump(clf, f)
        hyperparameters[_sym] = {
            'estimator': estFile.format(_sym),
            'stats': stats
        }
        # feature_importances = np.mean([
        #     p.named_steps.c.feature_importances_ for p in clf.estimators_
        # ], axis=0)

        # importances = {X.columns[i]: v for i, v in enumerate(feature_importances)}
        # labeled = {str(k): v for k, v in sorted(importances.items(), key=lambda item: -item[1])}

        # print({
        #     # 'features':sel_features
        #     'feature_importances': labeled,
        #     # 'rank': {l: i + 1 for i, l in enumerate(labeled.keys())},
        # })
        with open(resultFile, 'w') as f:  # Save results at every update
            json.dump(hyperparameters, f, indent=4)
        print("--- end ---")
예제 #4
0
def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/randomforest_sfm_hyperparameters.json'
    hyperparameters = {}
    if not os.path.exists(resultFile):
        logger.error('no hyperparameters!')
    with open(resultFile, 'r') as f:
        hyperparameters = json.load(f)
    for _sym, data in index.items():
        if _sym not in hyperparameters or not os.path.exists(
                hyperparameters[_sym]['estimator']):
            logger.error('{} does not exist.'.format(_sym))
        else:
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            # Replace nan with infinity so that it can later be imputed to a finite value
            features = features.replace([np.inf, -np.inf], np.nan)

            # Derive target classes from closing price
            target_pct = target_price_variation(features['close'])
            target = target_binned_price_variation(target_pct, n_bins=2)
            # target = target_discrete_price_variation(target_pct)

            # Use selected features
            preselected = hyperparameters[_sym]['features']
            #features = features[preselected]

            imp = IterativeImputer()
            features = pd.DataFrame(imp.fit_transform(features.values),
                                    index=features.index,
                                    columns=features.columns)
            sel = SelectKBest(score_func=f_classif,
                              k=min(30, len(features.columns)))
            sel.fit(features.values, target.values)
            bestfeatures = [
                c for c, f in zip(features.columns, sel.get_support()) if f
            ]
            print("Using features:\n{}".format(bestfeatures))
            features = features[bestfeatures]

            # Split data in train and blind test set with 70:30 ratio,
            #  most ML models don't take sequentiality into account, but our pipeline
            #  uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
            X_train, X_test, y_train, y_test = train_test_split(
                features.values, target.values, shuffle=False, test_size=0.3)
            # Summarize distribution
            print("Training set: # Features {}, # Samples {}".format(
                X_train.shape[1], X_train.shape[0]))
            plot_class_distribution("Training set", _sym, y_train)
            print("Test set: # Features {}, # Samples {}".format(
                X_test.shape[1], X_test.shape[0]))
            plot_class_distribution("Test set", _sym, y_test)
            if not np.isfinite(X_train).all():
                logger.warning("Training x is not finite!")
            if not np.isfinite(y_train).all():
                logger.warning("Training y is not finite!")
            if not np.isfinite(X_test).all():
                logger.warning("Test x is not finite!")
            if not np.isfinite(y_test).all():
                logger.warning("Test y is not finite!")

            # Build pipeline to be used as estimator in grid search
            #  so that each subset of the data is transformed independently
            #  to avoid contamination between folds.
            pipeline = Pipeline([
                (
                    'i', SimpleImputer()
                ),  # Replace nan's with the median value between previous and next observation
                ('s', RobustScaler()),
                ('c',
                 AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
            ])

            # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
            logger.info("Start Grid search")
            CV_rfc = GridSearchCV(estimator=pipeline,
                                  param_grid=DECISIONTREE_PARAM_GRID,
                                  cv=5,
                                  n_jobs=4,
                                  scoring='neg_mean_squared_error',
                                  verbose=1)
            CV_rfc.fit(X_train, y_train)
            logger.info("End Grid search")

            # Take the fitted ensemble with tuned hyperparameters
            clf = CV_rfc.best_estimator_
            # Test ensemble's performance on training and test sets
            logger.info("Classification report on train set")
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train,
                                                 predictions1,
                                                 output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test,
                                                predictions2,
                                                output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'train_report': train_report,
                'test_report': test_report,
            }
            print(stats)
            print("--- end ---")
    #cm = pd.read_csv("./data/preprocessed/coinmetrics.io/csv/{}.csv".format(_sym), sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
    ohlcv_7d = builder.periodic_ohlcv_pct_change(ohlcv, period=7, label=True)
    ohlcv_30d = builder.periodic_ohlcv_pct_change(ohlcv, period=30, label=True)
    ta = builder.features_ta(ohlcv)
    ta_7d = builder.period_resampled_ta(ohlcv, period=7)
    ta_30d = builder.period_resampled_ta(ohlcv, period=30)
    dataframes = [ohlcv, ohlcv_7d, ohlcv_30d, ta, ta_7d, ta_30d]
    for i, df in enumerate(dataframes):
        if df.isnull().values.any():
            logger.error('Null values in dataframe {} for symbol {}'.format(
                i, _sym))
    df = pd.concat(dataframes,
                   axis='columns',
                   verify_integrity=True,
                   sort=True)
    df['target_pct'] = builder.target_price_variation(ohlcv, periods=1)

    csv_path = 'data/datasets/resampled_ohlcv_ta/csv/{}.csv'.format(
        _sym.lower())
    xls_path = 'data/datasets/resampled_ohlcv_ta/excel/{}.xlsx'.format(
        _sym.lower())
    df.to_csv(csv_path,
              sep=',',
              encoding='utf-8',
              index=True,
              index_label='Date')
    df.to_excel(xls_path, index=True, index_label='Date')
    corr = df.corr()
    correlation(corr,
                'data/datasets/resampled_ohlcv_ta/correlation/{}.png'.format(
                    _sym.lower()),
def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/randomforest_sfm_hyperparameters.json'
    hyperparameters = {}
    if not os.path.exists(resultFile):
        logger.error('no hyperparameters!')
    with open(resultFile, 'r') as f:
        hyperparameters = json.load(f)
    for _sym, data in index.items():
        if _sym not in hyperparameters or not os.path.exists(
                hyperparameters[_sym]['estimator']):
            logger.error('{} does not exist.'.format(_sym))
        else:
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            # Replace nan with infinity so that it can later be imputed to a finite value
            features = features.replace([np.inf, -np.inf], np.nan)
            #features = features[hyperparameters['feature_importances']]

            # Derive target classes from closing price
            target_pct = target_price_variation(features['close'])
            target = target_binned_price_variation(target_pct, n_bins=2)
            # target = target_discrete_price_variation(target_pct)

            # Split data in train and blind test set with 70:30 ratio,
            #  most ML models don't take sequentiality into account, but our pipeline
            #  uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
            X_train, X_test, y_train, y_test = train_test_split(
                features.values, target.values, shuffle=False, test_size=0.3)
            # Summarize distribution
            print("Training set: # Features {}, # Samples {}".format(
                X_train.shape[1], X_train.shape[0]))
            plot_class_distribution("Training set", _sym, y_train)
            print("Test set: # Features {}, # Samples {}".format(
                X_test.shape[1], X_test.shape[0]))
            plot_class_distribution("Test set", _sym, y_test)
            if not np.isfinite(X_train).all():
                logger.warning("Training x is not finite!")
            if not np.isfinite(y_train).all():
                logger.warning("Training y is not finite!")
            if not np.isfinite(X_test).all():
                logger.warning("Test x is not finite!")
            if not np.isfinite(y_test).all():
                logger.warning("Test y is not finite!")

            # Take the fitted ensemble with tuned hyperparameters
            clf = None
            with open(hyperparameters[_sym]['estimator'], 'rb') as f:
                clf = pickle.load(f)

            # Test ensemble's performance on training and test sets
            logger.info("Classification report on train set")
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train,
                                                 predictions1,
                                                 output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test,
                                                predictions2,
                                                output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'train_report': train_report,
                'test_report': test_report,
            }
            print(stats)
            print("--- end ---")