Exemplo n.º 1
0
def main():
    index = load_dataset('all_merged', return_index=True)
    for _sym, data in index.items():

        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
        # Replace nan with infinity so that it can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)

        # Derive target classes from closing price
        target_pct = target_price_variation(features['close'])
        target = target_binned_price_variation(target_pct, n_bins=2)
        # target = target_discrete_price_variation(target_pct)

        print("--- end ---")
def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/svc_hyperparameters.json'
    estFile = './data/datasets/all_merged/estimators/svc_{}.p'
    hyperparameters = {}
    for _sym, data in index.items():
        features = pd.read_csv(data['csv'],
                               sep=',',
                               encoding='utf-8',
                               index_col='Date',
                               parse_dates=True)
        # Replace nan with infinity so that it can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)
        # Derive target classes from closing price
        target_pct = target_price_variation(features['close'])
        target = target_binned_price_variation(target_pct, n_bins=2)
        # target = target_discrete_price_variation(target_pct)

        # Split data in train and blind test set with 70:30 ratio,
        #  most ML models don't take sequentiality into account, but our pipeline
        #  uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
        X_train, X_test, y_train, y_test = train_test_split(features.values,
                                                            target.values,
                                                            shuffle=False,
                                                            test_size=0.3)
        # Summarize distribution
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)
        if not np.isfinite(X_train).all():
            logger.warning("Training x is not finite!")
        if not np.isfinite(y_train).all():
            logger.warning("Training y is not finite!")
        if not np.isfinite(X_test).all():
            logger.warning("Test x is not finite!")
        if not np.isfinite(y_test).all():
            logger.warning("Test y is not finite!")
        # Build pipeline to be used as estimator in bagging classifier
        #  so that each subset of the data is transformed independently
        #  to avoid contamination between folds.
        pipeline = Pipeline([
            (
                'i', SimpleImputer()
            ),  # Replace nan's with the median value between previous and next observation
            (
                's', RobustScaler()
            ),  # Scale data in order to center it and increase robustness against noise and outliers
            # ('k', SelectKBest()), # Select top 10 best features
            # ('u', RandomUnderSampler()),
            ('c', SVC()),
        ])

        # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
        logger.info("Start Grid search")
        CV_rfc = GridSearchCV(estimator=pipeline,
                              param_grid=SVC_PARAM_GRID,
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              verbose=1)
        CV_rfc.fit(X_train, y_train)
        logger.info("End Grid search")

        # Take the fitted ensemble with tuned hyperparameters
        clf = CV_rfc.best_estimator_

        # Test ensemble's performance on training and test sets
        logger.info("Classification report on train set")
        predictions1 = clf.predict(X_train)
        print(classification_report(y_train, predictions1))
        logger.info("Classification report on test set")
        predictions2 = clf.predict(X_test)
        print(classification_report(y_test, predictions2))
        stats = {
            'score': accuracy_score(y_train, predictions1),
            'mse': mean_squared_error(y_train, predictions1),
            'test_score': accuracy_score(y_test, predictions2),
            'test_mse': mean_squared_error(y_test, predictions2),
            'cv_best_mse': -1 * CV_rfc.best_score_,  # CV score is negated MSE
            # 'cv_results': CV_rfc.cv_results_,
            'cv_bestparams': CV_rfc.best_params_,
        }
        print(stats)
        with open(estFile.format(_sym), 'wb') as f:
            pickle.dump(clf, f)
        hyperparameters[_sym] = {
            'estimator': estFile.format(_sym),
            'stats': stats
        }
        # feature_importances = np.mean([
        #     p.named_steps.c.feature_importances_ for p in clf.estimators_
        # ], axis=0)

        # importances = {X.columns[i]: v for i, v in enumerate(feature_importances)}
        # labeled = {str(k): v for k, v in sorted(importances.items(), key=lambda item: -item[1])}

        # print({
        #     # 'features':sel_features
        #     'feature_importances': labeled,
        #     # 'rank': {l: i + 1 for i, l in enumerate(labeled.keys())},
        # })
        with open(resultFile, 'w') as f:  # Save results at every update
            json.dump(hyperparameters, f, indent=4)
        print("--- end ---")
Exemplo n.º 3
0
def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/randomforest_sfm_hyperparameters.json'
    hyperparameters = {}
    if not os.path.exists(resultFile):
        logger.error('no hyperparameters!')
    with open(resultFile, 'r') as f:
        hyperparameters = json.load(f)
    for _sym, data in index.items():
        if _sym not in hyperparameters or not os.path.exists(
                hyperparameters[_sym]['estimator']):
            logger.error('{} does not exist.'.format(_sym))
        else:
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            # Replace nan with infinity so that it can later be imputed to a finite value
            features = features.replace([np.inf, -np.inf], np.nan)

            # Derive target classes from closing price
            target_pct = target_price_variation(features['close'])
            target = target_binned_price_variation(target_pct, n_bins=2)
            # target = target_discrete_price_variation(target_pct)

            # Use selected features
            preselected = hyperparameters[_sym]['features']
            #features = features[preselected]

            imp = IterativeImputer()
            features = pd.DataFrame(imp.fit_transform(features.values),
                                    index=features.index,
                                    columns=features.columns)
            sel = SelectKBest(score_func=f_classif,
                              k=min(30, len(features.columns)))
            sel.fit(features.values, target.values)
            bestfeatures = [
                c for c, f in zip(features.columns, sel.get_support()) if f
            ]
            print("Using features:\n{}".format(bestfeatures))
            features = features[bestfeatures]

            # Split data in train and blind test set with 70:30 ratio,
            #  most ML models don't take sequentiality into account, but our pipeline
            #  uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
            X_train, X_test, y_train, y_test = train_test_split(
                features.values, target.values, shuffle=False, test_size=0.3)
            # Summarize distribution
            print("Training set: # Features {}, # Samples {}".format(
                X_train.shape[1], X_train.shape[0]))
            plot_class_distribution("Training set", _sym, y_train)
            print("Test set: # Features {}, # Samples {}".format(
                X_test.shape[1], X_test.shape[0]))
            plot_class_distribution("Test set", _sym, y_test)
            if not np.isfinite(X_train).all():
                logger.warning("Training x is not finite!")
            if not np.isfinite(y_train).all():
                logger.warning("Training y is not finite!")
            if not np.isfinite(X_test).all():
                logger.warning("Test x is not finite!")
            if not np.isfinite(y_test).all():
                logger.warning("Test y is not finite!")

            # Build pipeline to be used as estimator in grid search
            #  so that each subset of the data is transformed independently
            #  to avoid contamination between folds.
            pipeline = Pipeline([
                (
                    'i', SimpleImputer()
                ),  # Replace nan's with the median value between previous and next observation
                ('s', RobustScaler()),
                ('c',
                 AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
            ])

            # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
            logger.info("Start Grid search")
            CV_rfc = GridSearchCV(estimator=pipeline,
                                  param_grid=DECISIONTREE_PARAM_GRID,
                                  cv=5,
                                  n_jobs=4,
                                  scoring='neg_mean_squared_error',
                                  verbose=1)
            CV_rfc.fit(X_train, y_train)
            logger.info("End Grid search")

            # Take the fitted ensemble with tuned hyperparameters
            clf = CV_rfc.best_estimator_
            # Test ensemble's performance on training and test sets
            logger.info("Classification report on train set")
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train,
                                                 predictions1,
                                                 output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test,
                                                predictions2,
                                                output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'train_report': train_report,
                'test_report': test_report,
            }
            print(stats)
            print("--- end ---")
Exemplo n.º 4
0
def main():
    index = load_dataset('all_merged',
                         return_index=True,
                         index_name='index_improved')
    resultFile = './data/datasets/all_merged/estimators/randomforest_sfm_imp_hyperparameters.json'
    estFile = './data/datasets/all_merged/estimators/randomforest_sfm_imp_{}.p'
    hyperparameters = {}
    if os.path.exists(resultFile):
        with open(resultFile, 'r') as f:
            hyperparameters = json.load(f)
    for _sym, data in index.items():
        features = pd.read_csv(data['csv'],
                               sep=',',
                               encoding='utf-8',
                               index_col='Date',
                               parse_dates=True)
        target = pd.read_csv(data['target_csv'],
                             sep=',',
                             encoding='utf-8',
                             index_col='Date',
                             parse_dates=True)
        # Replace nan with infinity so that it can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)
        # Derive target classes from closing price
        # target_pct = target_price_variation(features['close'])
        # target = target_binned_price_variation(target_pct, n_bins=2)
        target = target.loc[features.first_valid_index():features.
                            last_valid_index()]['binary_bin']
        # target = target_discrete_price_variation(target_pct)

        # Split data in train and blind test set with 70:30 ratio,
        #  most ML models don't take sequentiality into account, but our pipeline
        #  uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
        X_train, X_test, y_train, y_test = train_test_split(features.values,
                                                            target.values,
                                                            shuffle=False,
                                                            test_size=0.3)
        # Summarize distribution
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)
        if not np.isfinite(X_train).all():
            logger.warning("Training x is not finite!")
        if not np.isfinite(y_train).all():
            logger.warning("Training y is not finite!")
        if not np.isfinite(X_test).all():
            logger.warning("Test x is not finite!")
        if not np.isfinite(y_test).all():
            logger.warning("Test y is not finite!")
        # Build pipeline to be used as estimator in bagging classifier
        #  so that each subset of the data is transformed independently
        #  to avoid contamination between folds.
        pipeline = Pipeline([
            (
                'i', SimpleImputer()
            ),  # Replace nan's with the median value between previous and next observation
            (
                's', RobustScaler()
            ),  # Scale data in order to center it and increase robustness against noise and outliers
            #('k', SelectKBest()), # Select top 10 best features
            #('u', RandomUnderSampler()),
            ('c', RandomForestClassifier()),
        ])
        # Skip grid search if there are already hyperparameters for this set
        if _sym in hyperparameters and os.path.exists(
                hyperparameters[_sym]['estimator']):
            logger.info('{} already exists.'.format(_sym))
        else:
            # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
            logger.info("Start Grid search")
            CV_rfc = GridSearchCV(
                estimator=pipeline,
                param_grid=RANDOMFOREST_PARAM_GRID,
                cv=5,
                n_jobs=4,
                #scoring='neg_mean_squared_error',
                scoring='roc_auc',
                verbose=1)
            CV_rfc.fit(X_train, y_train)
            logger.info("End Grid search")

            # Take the fitted ensemble with tuned hyperparameters
            clf = CV_rfc.best_estimator_

            # Test ensemble's performance on training and test sets
            logger.info("Classification report on train set")
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train,
                                                 predictions1,
                                                 output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test,
                                                predictions2,
                                                output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'test_score_7': accuracy_score(y_test[0:7], predictions2[0:7]),
                'test_mse_7': mean_squared_error(y_test[0:7],
                                                 predictions2[0:7]),
                'train_report': train_report,
                'test_report': test_report,
                'cv_best_score': CV_rfc.best_score_,
                # 'cv_results': CV_rfc.cv_results_,
                'cv_bestparams': CV_rfc.best_params_,
            }
            print(json.dumps(stats, indent=4))
            with open(estFile.format(_sym), 'wb') as f:
                pickle.dump(clf, f)

            input = pd.DataFrame(
                X_train
            )  #.replace([np.inf, -np.inf], np.nan).fillna(method='ffill').replace(np.nan, 0)
            imp = SimpleImputer()
            input = imp.fit_transform(input)
            sca = RobustScaler()
            input = sca.fit_transform(input)
            sfm = SelectFromModel(estimator=clf.named_steps.c, prefit=True)
            sfm.transform(input)
            sup = sfm.get_support()
            sel_features = [c for c, p in zip(features.columns, sup) if p]
            importances = {
                features.columns[i]: v
                for i, v in enumerate(clf.named_steps.c.feature_importances_)
            }
            labeled_importances = {
                str(k): v
                for k, v in sorted(importances.items(),
                                   key=lambda item: -item[1])
            }
            hyperparameters[_sym] = {
                'estimator': estFile.format(_sym),
                'stats': stats,
                'features': sel_features,
                'feature_importances': labeled_importances
            }
            with open(resultFile, 'w') as f:  # Save results at every update
                json.dump(hyperparameters, f, indent=4)
            print("--- end ---")
def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/randomforest_sfm_hyperparameters.json'
    hyperparameters = {}
    if not os.path.exists(resultFile):
        logger.error('no hyperparameters!')
    with open(resultFile, 'r') as f:
        hyperparameters = json.load(f)
    for _sym, data in index.items():
        if _sym not in hyperparameters or not os.path.exists(
                hyperparameters[_sym]['estimator']):
            logger.error('{} does not exist.'.format(_sym))
        else:
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            # Replace nan with infinity so that it can later be imputed to a finite value
            features = features.replace([np.inf, -np.inf], np.nan)
            #features = features[hyperparameters['feature_importances']]

            # Derive target classes from closing price
            target_pct = target_price_variation(features['close'])
            target = target_binned_price_variation(target_pct, n_bins=2)
            # target = target_discrete_price_variation(target_pct)

            # Split data in train and blind test set with 70:30 ratio,
            #  most ML models don't take sequentiality into account, but our pipeline
            #  uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
            X_train, X_test, y_train, y_test = train_test_split(
                features.values, target.values, shuffle=False, test_size=0.3)
            # Summarize distribution
            print("Training set: # Features {}, # Samples {}".format(
                X_train.shape[1], X_train.shape[0]))
            plot_class_distribution("Training set", _sym, y_train)
            print("Test set: # Features {}, # Samples {}".format(
                X_test.shape[1], X_test.shape[0]))
            plot_class_distribution("Test set", _sym, y_test)
            if not np.isfinite(X_train).all():
                logger.warning("Training x is not finite!")
            if not np.isfinite(y_train).all():
                logger.warning("Training y is not finite!")
            if not np.isfinite(X_test).all():
                logger.warning("Test x is not finite!")
            if not np.isfinite(y_test).all():
                logger.warning("Test y is not finite!")

            # Take the fitted ensemble with tuned hyperparameters
            clf = None
            with open(hyperparameters[_sym]['estimator'], 'rb') as f:
                clf = pickle.load(f)

            # Test ensemble's performance on training and test sets
            logger.info("Classification report on train set")
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train,
                                                 predictions1,
                                                 output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test,
                                                predictions2,
                                                output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'train_report': train_report,
                'test_report': test_report,
            }
            print(stats)
            print("--- end ---")
def main():
    index = load_dataset('all_merged', return_index=True)
    for _sym, data in index.items():
        features, target = get_symbol_features(index, _sym)

        features_p = features[data['features']['ohlcv']].pct_change().replace(
            [np.inf, -np.inf], np.nan)
        features_p.columns = [c + '_p1' for c in features_p.columns]
        features_1 = features_p.shift(1)
        features_1.columns = [c + '_lag1' for c in features_1.columns]
        features_2 = features_p.shift(2)
        features_2.columns = [c + '_lag2' for c in features_2.columns]

        features_mean = features_p.rolling(3).mean()
        features_mean.columns = [c + '_mean_3' for c in features_mean.columns]

        ta = features[data['features']['ta'] + data['features']['ta_7d'] +
                      data['features']['ta_30d']]

        features = pd.concat([
            features['close'], ta, features_p, features_1, features_2,
            features_mean
        ],
                             axis=1)[30:]
        target = target[30:]
        # Split data in train and blind test set with 70:30 ratio,
        #  most ML models don't take sequentiality into account, but our pipeline
        #  uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
        X_train, X_test, y_train, y_test = train_test_split(features.values,
                                                            target.values,
                                                            shuffle=False,
                                                            test_size=0.3)
        logger.info("Start Feature Selection")
        imp = SimpleImputer()
        values = imp.fit_transform(X_train)
        #sel = SelectKBest(score_func=f_classif, k=min(10, X_train.shape[1]))
        feature_count = int(0.3 * X_train.shape[1])
        sel = RFECV(estimator=RandomForestClassifier(),
                    cv=5,
                    verbose=0,
                    n_jobs=4,
                    min_features_to_select=feature_count,
                    scoring='neg_mean_squared_error')
        sel.fit(values, y_train)
        logger.info("End Feature Selection")
        bestfeatures = [
            c for c, f in zip(features.columns, sel.get_support()) if f
        ]
        if not 'close' in bestfeatures:
            bestfeatures += ['close']
        print("Using features:\n{}".format(bestfeatures, len(bestfeatures)))

        train_features = pd.DataFrame(X_train, columns=features.columns)
        test_features = pd.DataFrame(X_test, columns=features.columns)
        X_train = train_features[bestfeatures].values
        X_test = test_features[bestfeatures].values

        # Summarize distribution
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)
        if not np.isfinite(X_train).all():
            logger.warning("Training x is not finite!")
        if not np.isfinite(y_train).all():
            logger.warning("Training y is not finite!")
        if not np.isfinite(X_test).all():
            logger.warning("Test x is not finite!")
        if not np.isfinite(y_test).all():
            logger.warning("Test y is not finite!")

        # Build pipeline to be used as estimator in grid search
        #  so that each subset of the data is transformed independently
        #  to avoid contamination between folds.
        pipeline = Pipeline([
            (
                'i', IterativeImputer()
            ),  # Replace nan's with the median value between previous and next observation
            ('s', MinMaxScaler(feature_range=(-1, 1))),
            ('c', MLPClassifier()),
        ])

        # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
        logger.info("Start Grid search")
        CV_rfc = GridSearchCV(estimator=pipeline,
                              param_grid=PARAM_GRID,
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              verbose=1)
        CV_rfc.fit(X_train, y_train)
        logger.info("End Grid search")

        # Take the fitted ensemble with tuned hyperparameters
        clf = CV_rfc.best_estimator_
        # Test ensemble's performance on training and test sets
        logger.info("Classification report on train set")
        predictions1 = clf.predict(X_train)
        train_report = classification_report(y_train,
                                             predictions1,
                                             output_dict=True)
        print(classification_report(y_train, predictions1))
        logger.info("Classification report on test set")
        predictions2 = clf.predict(X_test)
        test_report = classification_report(y_test,
                                            predictions2,
                                            output_dict=True)
        print(classification_report(y_test, predictions2))
        stats = {
            'score': accuracy_score(y_train, predictions1),
            'mse': mean_squared_error(y_train, predictions1),
            'test_score': accuracy_score(y_test, predictions2),
            'test_mse': mean_squared_error(y_test, predictions2),
            'train_report': train_report,
            'test_report': test_report,
        }
        print(CV_rfc.best_params_)
        num_samples = min(y_train.shape[0], y_test.shape[0], 30)
        print("Gains calculated on {} samples only!".format(num_samples))
        print(
            "Train Accuracy: {}\nTrain MSE: {}\nGains on train preds: 100 -> {}"
            .format(
                accuracy_score(y_train, predictions1),
                mean_squared_error(y_train, predictions1),
                test_gains(train_features['close'][0:num_samples],
                           predictions1[0:num_samples],
                           initial_balance=100,
                           position_size=0.1)))
        print(
            "Test Accuracy: {}\nTest MSE: {}\nGains on test preds: 100 -> {}".
            format(
                accuracy_score(y_test, predictions2),
                mean_squared_error(y_test, predictions2),
                test_gains(test_features['close'][0:num_samples],
                           predictions2[0:num_samples],
                           initial_balance=100,
                           position_size=0.1)))
        print("--- end ---")