def main():
    index = load_dataset('all_merged', return_index=True)
    for _sym, data in index.items():
        features = pd.read_csv(data['csv'],
                               sep=',',
                               encoding='utf-8',
                               index_col='Date',
                               parse_dates=True)
        # Replace infinity with NaN so that it can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)
        # Derive target classes from closing price
        target_pct = target_price_variation(features['close'])
        target = target_binned_price_variation(target_pct, n_bins=2)
        # target = target_discrete_price_variation(target_pct)
    print("--- end ---")
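# NOTE: target_price_variation and target_binned_price_variation are project
# helpers defined elsewhere and not shown in these listings. Below is a minimal
# sketch of plausible implementations, assuming the target is the next-period
# percent change of the close price, binned into discrete classes (with
# n_bins=2 reducing to a down/up label). The names and signatures follow the
# calls above; the bodies are an assumption, not the original code.
def target_price_variation(close, periods=1):
    # Percent change over the next `periods` observations, shifted so that
    # each row holds the variation it is supposed to predict.
    return close.pct_change(periods=periods).shift(-periods)

def target_binned_price_variation(target_pct, n_bins=2):
    # Discretize the continuous variation into n_bins classes.
    if n_bins == 2:
        return (target_pct > 0).astype(int)  # 0 = down, 1 = up
    return pd.cut(target_pct, bins=n_bins, labels=False)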
def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/svc_hyperparameters.json'
    estFile = './data/datasets/all_merged/estimators/svc_{}.p'
    hyperparameters = {}
    for _sym, data in index.items():
        features = pd.read_csv(data['csv'],
                               sep=',',
                               encoding='utf-8',
                               index_col='Date',
                               parse_dates=True)
        # Replace infinity with NaN so that it can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)
        # Derive target classes from closing price
        target_pct = target_price_variation(features['close'])
        target = target_binned_price_variation(target_pct, n_bins=2)
        # target = target_discrete_price_variation(target_pct)

        # Split data into train and blind test sets with a 70:30 ratio.
        # Most ML models don't take sequentiality into account, but our pipeline
        # uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
        X_train, X_test, y_train, y_test = train_test_split(features.values,
                                                            target.values,
                                                            shuffle=False,
                                                            test_size=0.3)
        # Summarize class distribution
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)
        if not np.isfinite(X_train).all():
            logger.warning("Training x is not finite!")
        if not np.isfinite(y_train).all():
            logger.warning("Training y is not finite!")
        if not np.isfinite(X_test).all():
            logger.warning("Test x is not finite!")
        if not np.isfinite(y_test).all():
            logger.warning("Test y is not finite!")

        # Build the pipeline to be used as estimator in the grid search
        # so that each subset of the data is transformed independently
        # to avoid contamination between folds.
        pipeline = Pipeline([
            ('i', SimpleImputer()),  # Replace NaNs with the column mean (default strategy)
            ('s', RobustScaler()),   # Center the data and increase robustness against noise and outliers
            # ('k', SelectKBest()),  # Select top 10 best features
            # ('u', RandomUnderSampler()),
            ('c', SVC()),
        ])
        # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
        logger.info("Start Grid search")
        CV_rfc = GridSearchCV(estimator=pipeline,
                              param_grid=SVC_PARAM_GRID,
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              verbose=1)
        CV_rfc.fit(X_train, y_train)
        logger.info("End Grid search")

        # Take the fitted ensemble with tuned hyperparameters
        clf = CV_rfc.best_estimator_

        # Test the ensemble's performance on training and test sets
        logger.info("Classification report on train set")
        predictions1 = clf.predict(X_train)
        print(classification_report(y_train, predictions1))
        logger.info("Classification report on test set")
        predictions2 = clf.predict(X_test)
        print(classification_report(y_test, predictions2))
        stats = {
            'score': accuracy_score(y_train, predictions1),
            'mse': mean_squared_error(y_train, predictions1),
            'test_score': accuracy_score(y_test, predictions2),
            'test_mse': mean_squared_error(y_test, predictions2),
            'cv_best_mse': -1 * CV_rfc.best_score_,  # CV score is negated MSE
            # 'cv_results': CV_rfc.cv_results_,
            'cv_bestparams': CV_rfc.best_params_,
        }
        print(stats)
        with open(estFile.format(_sym), 'wb') as f:
            pickle.dump(clf, f)
        hyperparameters[_sym] = {
            'estimator': estFile.format(_sym),
            'stats': stats
        }
        # feature_importances = np.mean([
        #     p.named_steps.c.feature_importances_ for p in clf.estimators_
        # ], axis=0)
        # importances = {X.columns[i]: v for i, v in enumerate(feature_importances)}
        # labeled = {str(k): v for k, v in sorted(importances.items(), key=lambda item: -item[1])}
        # print({
        #     # 'features': sel_features,
        #     'feature_importances': labeled,
        #     # 'rank': {l: i + 1 for i, l in enumerate(labeled.keys())},
        # })
        with open(resultFile, 'w') as f:  # Save results at every update
            json.dump(hyperparameters, f, indent=4)
    print("--- end ---")
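# NOTE: SVC_PARAM_GRID is defined elsewhere in the project and its exact
# contents are not shown here. A hypothetical sketch of what it might contain
# for the grid search above, assuming the usual SVC hyperparameters are
# searched; keys must be prefixed with the pipeline step name 'c'.
SVC_PARAM_GRID = {
    'c__C': [0.1, 1, 10, 100],       # regularization strength
    'c__kernel': ['rbf', 'linear'],  # kernel type
    'c__gamma': ['scale', 'auto'],   # RBF kernel coefficient
}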
def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/randomforest_sfm_hyperparameters.json'
    hyperparameters = {}
    if not os.path.exists(resultFile):
        logger.error('no hyperparameters!')
    with open(resultFile, 'r') as f:
        hyperparameters = json.load(f)
    for _sym, data in index.items():
        if _sym not in hyperparameters or not os.path.exists(
                hyperparameters[_sym]['estimator']):
            logger.error('{} does not exist.'.format(_sym))
        else:
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            # Replace infinity with NaN so that it can later be imputed to a finite value
            features = features.replace([np.inf, -np.inf], np.nan)
            # Derive target classes from closing price
            target_pct = target_price_variation(features['close'])
            target = target_binned_price_variation(target_pct, n_bins=2)
            # target = target_discrete_price_variation(target_pct)

            # Use selected features
            preselected = hyperparameters[_sym]['features']
            # features = features[preselected]
            imp = IterativeImputer()
            features = pd.DataFrame(imp.fit_transform(features.values),
                                    index=features.index,
                                    columns=features.columns)
            sel = SelectKBest(score_func=f_classif,
                              k=min(30, len(features.columns)))
            sel.fit(features.values, target.values)
            bestfeatures = [
                c for c, f in zip(features.columns, sel.get_support()) if f
            ]
            print("Using features:\n{}".format(bestfeatures))
            features = features[bestfeatures]

            # Split data into train and blind test sets with a 70:30 ratio.
            # Most ML models don't take sequentiality into account, but our pipeline
            # uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
            X_train, X_test, y_train, y_test = train_test_split(
                features.values, target.values, shuffle=False, test_size=0.3)

            # Summarize class distribution
            print("Training set: # Features {}, # Samples {}".format(
                X_train.shape[1], X_train.shape[0]))
            plot_class_distribution("Training set", _sym, y_train)
            print("Test set: # Features {}, # Samples {}".format(
                X_test.shape[1], X_test.shape[0]))
            plot_class_distribution("Test set", _sym, y_test)
            if not np.isfinite(X_train).all():
                logger.warning("Training x is not finite!")
            if not np.isfinite(y_train).all():
                logger.warning("Training y is not finite!")
            if not np.isfinite(X_test).all():
                logger.warning("Test x is not finite!")
            if not np.isfinite(y_test).all():
                logger.warning("Test y is not finite!")

            # Build the pipeline to be used as estimator in the grid search
            # so that each subset of the data is transformed independently
            # to avoid contamination between folds.
            pipeline = Pipeline([
                ('i', SimpleImputer()),  # Replace NaNs with the column mean (default strategy)
                ('s', RobustScaler()),
                ('c', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
            ])
            # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
            logger.info("Start Grid search")
            CV_rfc = GridSearchCV(estimator=pipeline,
                                  param_grid=DECISIONTREE_PARAM_GRID,
                                  cv=5,
                                  n_jobs=4,
                                  scoring='neg_mean_squared_error',
                                  verbose=1)
            CV_rfc.fit(X_train, y_train)
            logger.info("End Grid search")

            # Take the fitted ensemble with tuned hyperparameters
            clf = CV_rfc.best_estimator_

            # Test the ensemble's performance on training and test sets
            logger.info("Classification report on train set")
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train,
                                                 predictions1,
                                                 output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test,
                                                predictions2,
                                                output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'train_report': train_report,
                'test_report': test_report,
            }
            print(stats)
    print("--- end ---")
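# NOTE: DECISIONTREE_PARAM_GRID is defined elsewhere in the project. A
# hypothetical sketch for the AdaBoost-over-DecisionTree step named 'c' above;
# parameters of the nested base estimator are reached through 'base_estimator__'.
DECISIONTREE_PARAM_GRID = {
    'c__n_estimators': [50, 100, 200],          # number of boosting rounds
    'c__learning_rate': [0.1, 0.5, 1.0],        # shrinkage applied to each weak learner
    'c__base_estimator__max_depth': [1, 2, 3],  # depth of each decision tree stump
}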
def main():
    index = load_dataset('all_merged', return_index=True, index_name='index_improved')
    resultFile = './data/datasets/all_merged/estimators/randomforest_sfm_imp_hyperparameters.json'
    estFile = './data/datasets/all_merged/estimators/randomforest_sfm_imp_{}.p'
    hyperparameters = {}
    if os.path.exists(resultFile):
        with open(resultFile, 'r') as f:
            hyperparameters = json.load(f)
    for _sym, data in index.items():
        features = pd.read_csv(data['csv'],
                               sep=',',
                               encoding='utf-8',
                               index_col='Date',
                               parse_dates=True)
        target = pd.read_csv(data['target_csv'],
                             sep=',',
                             encoding='utf-8',
                             index_col='Date',
                             parse_dates=True)
        # Replace infinity with NaN so that it can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)
        # Derive target classes from closing price
        # target_pct = target_price_variation(features['close'])
        # target = target_binned_price_variation(target_pct, n_bins=2)
        target = target.loc[features.first_valid_index():
                            features.last_valid_index()]['binary_bin']
        # target = target_discrete_price_variation(target_pct)

        # Split data into train and blind test sets with a 70:30 ratio.
        # Most ML models don't take sequentiality into account, but our pipeline
        # uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
        X_train, X_test, y_train, y_test = train_test_split(features.values,
                                                            target.values,
                                                            shuffle=False,
                                                            test_size=0.3)
        # Summarize class distribution
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)
        if not np.isfinite(X_train).all():
            logger.warning("Training x is not finite!")
        if not np.isfinite(y_train).all():
            logger.warning("Training y is not finite!")
        if not np.isfinite(X_test).all():
            logger.warning("Test x is not finite!")
        if not np.isfinite(y_test).all():
            logger.warning("Test y is not finite!")

        # Build the pipeline to be used as estimator in the grid search
        # so that each subset of the data is transformed independently
        # to avoid contamination between folds.
        pipeline = Pipeline([
            ('i', SimpleImputer()),  # Replace NaNs with the column mean (default strategy)
            ('s', RobustScaler()),   # Center the data and increase robustness against noise and outliers
            # ('k', SelectKBest()),  # Select top 10 best features
            # ('u', RandomUnderSampler()),
            ('c', RandomForestClassifier()),
        ])
        # Skip grid search if there are already hyperparameters for this set
        if _sym in hyperparameters and os.path.exists(
                hyperparameters[_sym]['estimator']):
            logger.info('{} already exists.'.format(_sym))
        else:
            # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
            logger.info("Start Grid search")
            CV_rfc = GridSearchCV(
                estimator=pipeline,
                param_grid=RANDOMFOREST_PARAM_GRID,
                cv=5,
                n_jobs=4,
                # scoring='neg_mean_squared_error',
                scoring='roc_auc',
                verbose=1)
            CV_rfc.fit(X_train, y_train)
            logger.info("End Grid search")

            # Take the fitted ensemble with tuned hyperparameters
            clf = CV_rfc.best_estimator_

            # Test the ensemble's performance on training and test sets
            logger.info("Classification report on train set")
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train,
                                                 predictions1,
                                                 output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test,
                                                predictions2,
                                                output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'test_score_7': accuracy_score(y_test[0:7], predictions2[0:7]),
                'test_mse_7': mean_squared_error(y_test[0:7], predictions2[0:7]),
                'train_report': train_report,
                'test_report': test_report,
                'cv_best_score': CV_rfc.best_score_,
                # 'cv_results': CV_rfc.cv_results_,
                'cv_bestparams': CV_rfc.best_params_,
            }
            print(json.dumps(stats, indent=4))
            with open(estFile.format(_sym), 'wb') as f:
                pickle.dump(clf, f)

            # Select the most important features from the fitted forest
            input = pd.DataFrame(X_train)
            # .replace([np.inf, -np.inf], np.nan).fillna(method='ffill').replace(np.nan, 0)
            imp = SimpleImputer()
            input = imp.fit_transform(input)
            sca = RobustScaler()
            input = sca.fit_transform(input)
            sfm = SelectFromModel(estimator=clf.named_steps.c, prefit=True)
            sfm.transform(input)
            sup = sfm.get_support()
            sel_features = [c for c, p in zip(features.columns, sup) if p]
            importances = {
                features.columns[i]: v
                for i, v in enumerate(clf.named_steps.c.feature_importances_)
            }
            labeled_importances = {
                str(k): v
                for k, v in sorted(importances.items(), key=lambda item: -item[1])
            }
            hyperparameters[_sym] = {
                'estimator': estFile.format(_sym),
                'stats': stats,
                'features': sel_features,
                'feature_importances': labeled_importances
            }
            with open(resultFile, 'w') as f:  # Save results at every update
                json.dump(hyperparameters, f, indent=4)
    print("--- end ---")
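# NOTE: RANDOMFOREST_PARAM_GRID is defined elsewhere in the project. A
# hypothetical sketch for the RandomForestClassifier step named 'c' in the
# pipeline above; the actual grid used by the author may differ.
RANDOMFOREST_PARAM_GRID = {
    'c__n_estimators': [100, 200, 500],     # number of trees in the forest
    'c__max_depth': [None, 4, 8],           # maximum depth of each tree
    'c__min_samples_split': [2, 10],        # minimum samples required to split a node
    'c__class_weight': [None, 'balanced'],  # optionally reweight imbalanced classes
}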
def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/randomforest_sfm_hyperparameters.json'
    hyperparameters = {}
    if not os.path.exists(resultFile):
        logger.error('no hyperparameters!')
    with open(resultFile, 'r') as f:
        hyperparameters = json.load(f)
    for _sym, data in index.items():
        if _sym not in hyperparameters or not os.path.exists(
                hyperparameters[_sym]['estimator']):
            logger.error('{} does not exist.'.format(_sym))
        else:
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            # Replace infinity with NaN so that it can later be imputed to a finite value
            features = features.replace([np.inf, -np.inf], np.nan)
            # features = features[hyperparameters['feature_importances']]
            # Derive target classes from closing price
            target_pct = target_price_variation(features['close'])
            target = target_binned_price_variation(target_pct, n_bins=2)
            # target = target_discrete_price_variation(target_pct)

            # Split data into train and blind test sets with a 70:30 ratio.
            # Most ML models don't take sequentiality into account, but our pipeline
            # uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
            X_train, X_test, y_train, y_test = train_test_split(
                features.values, target.values, shuffle=False, test_size=0.3)

            # Summarize class distribution
            print("Training set: # Features {}, # Samples {}".format(
                X_train.shape[1], X_train.shape[0]))
            plot_class_distribution("Training set", _sym, y_train)
            print("Test set: # Features {}, # Samples {}".format(
                X_test.shape[1], X_test.shape[0]))
            plot_class_distribution("Test set", _sym, y_test)
            if not np.isfinite(X_train).all():
                logger.warning("Training x is not finite!")
            if not np.isfinite(y_train).all():
                logger.warning("Training y is not finite!")
            if not np.isfinite(X_test).all():
                logger.warning("Test x is not finite!")
            if not np.isfinite(y_test).all():
                logger.warning("Test y is not finite!")

            # Load the fitted ensemble with tuned hyperparameters
            clf = None
            with open(hyperparameters[_sym]['estimator'], 'rb') as f:
                clf = pickle.load(f)

            # Test the ensemble's performance on training and test sets
            logger.info("Classification report on train set")
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train,
                                                 predictions1,
                                                 output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test,
                                                predictions2,
                                                output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'train_report': train_report,
                'test_report': test_report,
            }
            print(stats)
    print("--- end ---")
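# NOTE: plot_class_distribution is a project helper whose implementation is not
# included in these listings. A minimal sketch, assuming it simply bar-plots the
# class counts of a label vector for a given symbol; matplotlib and the exact
# layout are assumptions.
import matplotlib.pyplot as plt

def plot_class_distribution(set_name, sym, y):
    # Count the occurrences of each class and display them as a bar chart.
    classes, counts = np.unique(y, return_counts=True)
    plt.bar([str(c) for c in classes], counts)
    plt.title("{} class distribution for {}".format(set_name, sym))
    plt.xlabel("Class")
    plt.ylabel("Samples")
    plt.show()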
def main():
    index = load_dataset('all_merged', return_index=True)
    for _sym, data in index.items():
        features, target = get_symbol_features(index, _sym)

        # Build percent-change, lagged and rolling-mean versions of the OHLCV features
        features_p = features[data['features']['ohlcv']].pct_change().replace(
            [np.inf, -np.inf], np.nan)
        features_p.columns = [c + '_p1' for c in features_p.columns]
        features_1 = features_p.shift(1)
        features_1.columns = [c + '_lag1' for c in features_1.columns]
        features_2 = features_p.shift(2)
        features_2.columns = [c + '_lag2' for c in features_2.columns]
        features_mean = features_p.rolling(3).mean()
        features_mean.columns = [c + '_mean_3' for c in features_mean.columns]
        ta = features[data['features']['ta'] + data['features']['ta_7d'] +
                      data['features']['ta_30d']]
        features = pd.concat([
            features['close'], ta, features_p, features_1, features_2,
            features_mean
        ], axis=1)[30:]
        target = target[30:]

        # Split data into train and blind test sets with a 70:30 ratio.
        # Most ML models don't take sequentiality into account, but our pipeline
        # uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
        X_train, X_test, y_train, y_test = train_test_split(features.values,
                                                            target.values,
                                                            shuffle=False,
                                                            test_size=0.3)

        logger.info("Start Feature Selection")
        imp = SimpleImputer()
        values = imp.fit_transform(X_train)
        # sel = SelectKBest(score_func=f_classif, k=min(10, X_train.shape[1]))
        feature_count = int(0.3 * X_train.shape[1])
        sel = RFECV(estimator=RandomForestClassifier(),
                    cv=5,
                    verbose=0,
                    n_jobs=4,
                    min_features_to_select=feature_count,
                    scoring='neg_mean_squared_error')
        sel.fit(values, y_train)
        logger.info("End Feature Selection")
        bestfeatures = [
            c for c, f in zip(features.columns, sel.get_support()) if f
        ]
        if 'close' not in bestfeatures:
            bestfeatures += ['close']
        print("Using {} features:\n{}".format(len(bestfeatures), bestfeatures))
        train_features = pd.DataFrame(X_train, columns=features.columns)
        test_features = pd.DataFrame(X_test, columns=features.columns)
        X_train = train_features[bestfeatures].values
        X_test = test_features[bestfeatures].values

        # Summarize class distribution
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)
        if not np.isfinite(X_train).all():
            logger.warning("Training x is not finite!")
        if not np.isfinite(y_train).all():
            logger.warning("Training y is not finite!")
        if not np.isfinite(X_test).all():
            logger.warning("Test x is not finite!")
        if not np.isfinite(y_test).all():
            logger.warning("Test y is not finite!")

        # Build the pipeline to be used as estimator in the grid search
        # so that each subset of the data is transformed independently
        # to avoid contamination between folds.
        pipeline = Pipeline([
            ('i', IterativeImputer()),  # Impute NaNs by modeling each feature from the others
            ('s', MinMaxScaler(feature_range=(-1, 1))),
            ('c', MLPClassifier()),
        ])
        # Perform hyperparameter tuning of the classifier with 5-fold cross validation
        logger.info("Start Grid search")
        CV_rfc = GridSearchCV(estimator=pipeline,
                              param_grid=PARAM_GRID,
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              verbose=1)
        CV_rfc.fit(X_train, y_train)
        logger.info("End Grid search")

        # Take the fitted classifier with tuned hyperparameters
        clf = CV_rfc.best_estimator_

        # Test the classifier's performance on training and test sets
        logger.info("Classification report on train set")
        predictions1 = clf.predict(X_train)
        train_report = classification_report(y_train,
                                             predictions1,
                                             output_dict=True)
        print(classification_report(y_train, predictions1))
        logger.info("Classification report on test set")
        predictions2 = clf.predict(X_test)
        test_report = classification_report(y_test,
                                            predictions2,
                                            output_dict=True)
        print(classification_report(y_test, predictions2))
        stats = {
            'score': accuracy_score(y_train, predictions1),
            'mse': mean_squared_error(y_train, predictions1),
            'test_score': accuracy_score(y_test, predictions2),
            'test_mse': mean_squared_error(y_test, predictions2),
            'train_report': train_report,
            'test_report': test_report,
        }
        print(CV_rfc.best_params_)

        # Simulate trading gains on a short window of predictions
        num_samples = min(y_train.shape[0], y_test.shape[0], 30)
        print("Gains calculated on {} samples only!".format(num_samples))
        print("Train Accuracy: {}\nTrain MSE: {}\nGains on train preds: 100 -> {}".format(
            accuracy_score(y_train, predictions1),
            mean_squared_error(y_train, predictions1),
            test_gains(train_features['close'][0:num_samples],
                       predictions1[0:num_samples],
                       initial_balance=100,
                       position_size=0.1)))
        print("Test Accuracy: {}\nTest MSE: {}\nGains on test preds: 100 -> {}".format(
            accuracy_score(y_test, predictions2),
            mean_squared_error(y_test, predictions2),
            test_gains(test_features['close'][0:num_samples],
                       predictions2[0:num_samples],
                       initial_balance=100,
                       position_size=0.1)))
    print("--- end ---")
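# NOTE: test_gains is a project helper not defined in these listings. Below is
# a minimal sketch of a compatible backtest, assuming label 1 means "price goes
# up" and that each such signal invests a fixed fraction of the balance in the
# move from the current close to the next one; the exact trading rules of the
# original helper are an assumption.
def test_gains(close, predictions, initial_balance=100, position_size=0.1):
    balance = initial_balance
    close = list(close)
    for i, pred in enumerate(predictions[:-1]):
        if pred == 1 and close[i] != 0:
            stake = balance * position_size
            # Profit or loss of the stake over the next period's price move
            balance += stake * (close[i + 1] - close[i]) / close[i]
    return balance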