# Imports assumed by the scripts below (standard library, numpy/pandas and
# scikit-learn). Project-level helpers -- load_dataset, target_price_variation,
# target_binned_price_variation, target_discrete_price_variation,
# plot_class_distribution, logger, SVC_PARAM_GRID, DECISIONTREE_PARAM_GRID --
# come from the repo's own modules and are not shown in these listings.
import json
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 (enables IterativeImputer)
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


def get_symbol_features(index, sym):
    data = index[sym]
    features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                           index_col='Date', parse_dates=True)
    # Replace infinities with NaN so that they can later be imputed to a finite value
    features = features.replace([np.inf, -np.inf], np.nan)
    # Derive target classes from the closing price
    target_pct = target_price_variation(features['close'])
    target = target_binned_price_variation(target_pct, n_bins=2)
    return features, target
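
# A minimal usage sketch (an assumption, not part of the original scripts):
# how get_symbol_features might be called for a single symbol. 'btc' is a
# hypothetical key; real keys depend on the index built by load_dataset.
def example_usage():
    index = load_dataset('all_merged', return_index=True)
    features, target = get_symbol_features(index, 'btc')
    print("Loaded {} rows x {} columns".format(features.shape[0], features.shape[1]))
    print("Class balance:\n{}".format(target.value_counts()))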


def main():
    index = load_dataset('all_merged', return_index=True)
    for _sym, data in index.items():
        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                               index_col='Date', parse_dates=True)
        # Replace infinities with NaN so that they can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)
        # Derive target classes from the closing price
        target_pct = target_price_variation(features['close'])
        target = target_binned_price_variation(target_pct, n_bins=2)
        # target = target_discrete_price_variation(target_pct)
    print("--- end ---")
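
# The target helpers are defined elsewhere in the project. A minimal sketch
# (an assumption about their semantics, not the original implementation) of
# how a binned binary target could be derived from the closing price:
def example_target_binned(close, n_bins=2):
    # Next-period percent variation: (close[t+1] - close[t]) / close[t]
    pct = close.pct_change().shift(-1)
    if n_bins == 2:
        # Split at zero: 0 = non-positive variation, 1 = positive variation
        return (pct > 0).astype(int)
    # Otherwise discretize the variation into n_bins equal-width bins
    return pd.cut(pct, n_bins, labels=False)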


def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/svc_hyperparameters.json'
    estFile = './data/datasets/all_merged/estimators/svc_{}.p'
    hyperparameters = {}
    for _sym, data in index.items():
        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                               index_col='Date', parse_dates=True)
        # Replace infinities with NaN so that they can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)
        # Derive target classes from the closing price
        target_pct = target_price_variation(features['close'])
        target = target_binned_price_variation(target_pct, n_bins=2)
        # target = target_discrete_price_variation(target_pct)

        # Split the data into a train and a blind test set with a 70:30 ratio.
        # Most ML models don't take sequentiality into account, but our pipeline
        # imputes NaNs with the training-set mean, so the data is not shuffled
        # to avoid leaking future information into the imputed values.
        X_train, X_test, y_train, y_test = train_test_split(
            features.values, target.values, shuffle=False, test_size=0.3)

        # Summarize the class distribution of both sets
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)
        if not np.isfinite(X_train).all():
            logger.warning("Training x is not finite!")
        if not np.isfinite(y_train).all():
            logger.warning("Training y is not finite!")
        if not np.isfinite(X_test).all():
            logger.warning("Test x is not finite!")
        if not np.isfinite(y_test).all():
            logger.warning("Test y is not finite!")

        # Build a pipeline to be used as the estimator in the grid search,
        # so that each fold of the data is transformed independently
        # and contamination between folds is avoided.
        pipeline = Pipeline([
            # Replace NaNs with the column mean (SimpleImputer's default strategy)
            ('i', SimpleImputer()),
            # Scale the data to center it and increase robustness against
            # noise and outliers
            ('s', RobustScaler()),
            # ('k', SelectKBest()),  # Select the 10 best features (default k)
            # ('u', RandomUnderSampler()),
            ('c', SVC()),
        ])

        # Perform hyperparameter tuning of the pipeline with 5-fold cross validation
        logger.info("Start Grid search")
        CV_rfc = GridSearchCV(estimator=pipeline, param_grid=SVC_PARAM_GRID, cv=5,
                              n_jobs=4, scoring='neg_mean_squared_error', verbose=1)
        CV_rfc.fit(X_train, y_train)
        logger.info("End Grid search")

        # Take the fitted pipeline with tuned hyperparameters
        clf = CV_rfc.best_estimator_

        # Report the classifier's performance on the training and test sets
        logger.info("Classification report on train set")
        predictions1 = clf.predict(X_train)
        print(classification_report(y_train, predictions1))
        logger.info("Classification report on test set")
        predictions2 = clf.predict(X_test)
        print(classification_report(y_test, predictions2))
        stats = {
            'score': accuracy_score(y_train, predictions1),
            'mse': mean_squared_error(y_train, predictions1),
            'test_score': accuracy_score(y_test, predictions2),
            'test_mse': mean_squared_error(y_test, predictions2),
            'cv_best_mse': -1 * CV_rfc.best_score_,  # CV score is negated MSE
            # 'cv_results': CV_rfc.cv_results_,
            'cv_bestparams': CV_rfc.best_params_,
        }
        print(stats)
        with open(estFile.format(_sym), 'wb') as f:
            pickle.dump(clf, f)
        hyperparameters[_sym] = {
            'estimator': estFile.format(_sym),
            'stats': stats
        }
        # feature_importances = np.mean([
        #     p.named_steps.c.feature_importances_ for p in clf.estimators_
        # ], axis=0)
        # importances = {X.columns[i]: v for i, v in enumerate(feature_importances)}
        # labeled = {str(k): v for k, v in sorted(importances.items(), key=lambda item: -item[1])}
        # print({
        #     # 'features': sel_features,
        #     'feature_importances': labeled,
        #     # 'rank': {l: i + 1 for i, l in enumerate(labeled.keys())},
        # })

        # Save results at every update
        with open(resultFile, 'w') as f:
            json.dump(hyperparameters, f, indent=4)
    print("--- end ---")
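
# SVC_PARAM_GRID is defined elsewhere in the project. A minimal sketch of what
# it could contain (an assumption, not the original grid): keys are prefixed
# with the pipeline step name 'c' so GridSearchCV routes them to the SVC step.
SVC_PARAM_GRID_EXAMPLE = {
    'c__C': [0.1, 1, 10, 100],
    'c__kernel': ['rbf', 'linear'],
    'c__gamma': ['scale', 'auto'],
}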


def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/randomforest_sfm_hyperparameters.json'
    hyperparameters = {}
    if not os.path.exists(resultFile):
        logger.error('no hyperparameters!')
        return  # Nothing to load, so abort instead of crashing on the open() below
    with open(resultFile, 'r') as f:
        hyperparameters = json.load(f)
    for _sym, data in index.items():
        if _sym not in hyperparameters or not os.path.exists(hyperparameters[_sym]['estimator']):
            logger.error('{} does not exist.'.format(_sym))
        else:
            features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                                   index_col='Date', parse_dates=True)
            # Replace infinities with NaN so that they can later be imputed to a finite value
            features = features.replace([np.inf, -np.inf], np.nan)
            # Derive target classes from the closing price
            target_pct = target_price_variation(features['close'])
            target = target_binned_price_variation(target_pct, n_bins=2)
            # target = target_discrete_price_variation(target_pct)

            # Features selected in a previous run (selection currently disabled)
            preselected = hyperparameters[_sym]['features']
            # features = features[preselected]

            # Impute missing values, then keep the (at most) 30 features that
            # score best on an ANOVA F-test against the target
            imp = IterativeImputer()
            features = pd.DataFrame(imp.fit_transform(features.values),
                                    index=features.index, columns=features.columns)
            sel = SelectKBest(score_func=f_classif, k=min(30, len(features.columns)))
            sel.fit(features.values, target.values)
            bestfeatures = [c for c, f in zip(features.columns, sel.get_support()) if f]
            print("Using features:\n{}".format(bestfeatures))
            features = features[bestfeatures]

            # Split the data into a train and a blind test set with a 70:30 ratio.
            # Most ML models don't take sequentiality into account, but our pipeline
            # imputes NaNs with the training-set mean, so the data is not shuffled
            # to avoid leaking future information into the imputed values.
            X_train, X_test, y_train, y_test = train_test_split(
                features.values, target.values, shuffle=False, test_size=0.3)

            # Summarize the class distribution of both sets
            print("Training set: # Features {}, # Samples {}".format(
                X_train.shape[1], X_train.shape[0]))
            plot_class_distribution("Training set", _sym, y_train)
            print("Test set: # Features {}, # Samples {}".format(
                X_test.shape[1], X_test.shape[0]))
            plot_class_distribution("Test set", _sym, y_test)
            if not np.isfinite(X_train).all():
                logger.warning("Training x is not finite!")
            if not np.isfinite(y_train).all():
                logger.warning("Training y is not finite!")
            if not np.isfinite(X_test).all():
                logger.warning("Test x is not finite!")
            if not np.isfinite(y_test).all():
                logger.warning("Test y is not finite!")

            # Build a pipeline to be used as the estimator in the grid search,
            # so that each fold of the data is transformed independently
            # and contamination between folds is avoided.
            pipeline = Pipeline([
                # Replace NaNs with the column mean (SimpleImputer's default strategy)
                ('i', SimpleImputer()),
                # Scale the data to center it and increase robustness against
                # noise and outliers
                ('s', RobustScaler()),
                ('c', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
            ])

            # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
            logger.info("Start Grid search")
            CV_rfc = GridSearchCV(estimator=pipeline, param_grid=DECISIONTREE_PARAM_GRID,
                                  cv=5, n_jobs=4, scoring='neg_mean_squared_error',
                                  verbose=1)
            CV_rfc.fit(X_train, y_train)
            logger.info("End Grid search")

            # Take the fitted ensemble with tuned hyperparameters
            clf = CV_rfc.best_estimator_

            # Report the ensemble's performance on the training and test sets
            logger.info("Classification report on train set")
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train, predictions1, output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test, predictions2, output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'train_report': train_report,
                'test_report': test_report,
            }
            print(stats)
    print("--- end ---")
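
# DECISIONTREE_PARAM_GRID is defined elsewhere in the project. A minimal
# sketch of what it could contain (an assumption, not the original grid):
# the '__' paths reach through pipeline step 'c' into AdaBoost and its base
# decision tree.
DECISIONTREE_PARAM_GRID_EXAMPLE = {
    'c__n_estimators': [50, 100, 200],
    'c__learning_rate': [0.01, 0.1, 1.0],
    'c__base_estimator__max_depth': [1, 2, 3],
}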


def main():
    index = load_dataset('all_merged', return_index=True)
    resultFile = './data/datasets/all_merged/estimators/randomforest_sfm_hyperparameters.json'
    hyperparameters = {}
    if not os.path.exists(resultFile):
        logger.error('no hyperparameters!')
        return  # Nothing to load, so abort instead of crashing on the open() below
    with open(resultFile, 'r') as f:
        hyperparameters = json.load(f)
    for _sym, data in index.items():
        if _sym not in hyperparameters or not os.path.exists(hyperparameters[_sym]['estimator']):
            logger.error('{} does not exist.'.format(_sym))
        else:
            features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                                   index_col='Date', parse_dates=True)
            # Replace infinities with NaN so that they can later be imputed to a finite value
            features = features.replace([np.inf, -np.inf], np.nan)
            # features = features[hyperparameters['feature_importances']]
            # Derive target classes from the closing price
            target_pct = target_price_variation(features['close'])
            target = target_binned_price_variation(target_pct, n_bins=2)
            # target = target_discrete_price_variation(target_pct)

            # Reproduce the non-shuffled 70:30 split used during training, so
            # the test set is the same blind window the estimator never saw.
            X_train, X_test, y_train, y_test = train_test_split(
                features.values, target.values, shuffle=False, test_size=0.3)

            # Summarize the class distribution of both sets
            print("Training set: # Features {}, # Samples {}".format(
                X_train.shape[1], X_train.shape[0]))
            plot_class_distribution("Training set", _sym, y_train)
            print("Test set: # Features {}, # Samples {}".format(
                X_test.shape[1], X_test.shape[0]))
            plot_class_distribution("Test set", _sym, y_test)
            if not np.isfinite(X_train).all():
                logger.warning("Training x is not finite!")
            if not np.isfinite(y_train).all():
                logger.warning("Training y is not finite!")
            if not np.isfinite(X_test).all():
                logger.warning("Test x is not finite!")
            if not np.isfinite(y_test).all():
                logger.warning("Test y is not finite!")

            # Load the fitted estimator with tuned hyperparameters
            with open(hyperparameters[_sym]['estimator'], 'rb') as f:
                clf = pickle.load(f)

            # Report the estimator's performance on the training and test sets
            logger.info("Classification report on train set")
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train, predictions1, output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test, predictions2, output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'train_report': train_report,
                'test_report': test_report,
            }
            print(stats)
    print("--- end ---")
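
# Each listing above defines its own main() in a separate script; a typical
# entry point (an assumption, not shown in the original) would be:
if __name__ == '__main__':
    main()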