def features(self, clf) -> pd.DataFrame:
    """Score ``clf`` once per configured feature set.

    For every entry of ``self.feature_columns_dict`` the listed columns of
    ``self.df`` are selected and, when ``get_number_of_pca_components``
    returns a truthy value, projected with a PCA fitted on those same
    columns. The result is passed to ``model.validate_model`` together with
    the ``mid_price_indicator`` column.

    Returns:
        A DataFrame with one row per feature set containing the mean of the
        ``matthews``, ``f1`` and ``roc_auc`` score columns plus the
        ``stock`` and ``features`` labels.
    """
    metric_names = ('matthews', 'f1', 'roc_auc')
    rows = []
    for feature_name, columns in self.feature_columns_dict.items():
        logger.debug('Checking feature %s for %s r=%s s=%s',
                     feature_name, self.stock, self.r, self.s)
        inputs = self.df[columns]

        component_count = self.get_number_of_pca_components(feature_name)
        if component_count:
            reducer = PCA(n_components=component_count)
            reducer.fit(inputs)
            inputs = reducer.transform(inputs)

        raw_scores = model.validate_model(
            clf, inputs, self.df['mid_price_indicator'])
        score_frame = pd.DataFrame(raw_scores)

        # np.mean over the raw values (not Series.mean) so NaNs propagate.
        summary = {
            metric: np.mean(score_frame[metric].values)
            for metric in metric_names
        }
        summary['stock'] = self.stock
        summary['features'] = feature_name

        logger.info('Scores %s r=%s s=%s - %s',
                    self.stock, self.r, self.s, summary)
        rows.append(summary)
    return pd.DataFrame(rows)
def svm_classification(df, gdf_columns, C=1000, gamma=1) -> dict:
    """Validate an RBF-kernel ``SVC`` on selected columns of ``df``.

    Args:
        df: frame holding the feature columns and ``mid_price_indicator``.
        gdf_columns: labels of the columns used as model inputs.
        C: forwarded to ``SVC`` as ``C``.
        gamma: forwarded to ``SVC`` as ``gamma``.

    Returns:
        The value returned by ``model.validate_model`` for the classifier.
    """
    inputs = df.loc[:, gdf_columns]
    # The target is handed over as a single-column 2-D array.
    target = df['mid_price_indicator'].values.reshape(-1, 1)
    classifier = SVC(kernel='rbf', C=C, gamma=gamma)
    return model.validate_model(classifier, inputs, target)
def train_clf_with_split_pca(self, clf, feature_name, method=None):
    """Deprecated: train ``clf`` with PCA applied only to the ``gdf`` columns.

    The columns configured for ``feature_name`` are split by name: columns
    whose name contains ``'gdf'`` are compressed with PCA and re-attached as
    ``pca_0``, ``pca_1``, ... next to the remaining columns. When
    ``get_number_of_pca_components`` returns a falsy value no PCA is fitted
    and the ``gdf`` columns are left out of the training data entirely.

    Args:
        clf: classifier handed to ``model.validate_model``.
        feature_name: key into ``self.feature_columns_dict``.
        method: label stored under ``'method'`` in the result.

    Returns:
        dict combining the output of ``self.get_mean_scores``, the
        ``stock``/``method``/``features`` labels and the scores returned by
        ``self.get_score_for_clf_split_pca`` for ``self.df_test``.
    """
    logger.info('Training %s r=%s s=%s:', self.stock, self.r, self.s)
    selected = self.df[self.feature_columns_dict[feature_name]]
    gdf_columns = [col for col in selected.columns if 'gdf' in col]
    other_columns = [col for col in selected.columns if 'gdf' not in col]

    train_pca = selected[gdf_columns]
    # Explicit copy: columns are added below, and assigning into a selection
    # of self.df raises pandas' SettingWithCopyWarning.
    train_x = selected[other_columns].copy()

    n_components = self.get_number_of_pca_components(feature_name)
    pca = None
    if n_components:
        pca = PCA(n_components=n_components)
        pca.fit(train_pca)
        train_pca = pca.transform(train_pca)
        # Iterate over the transformed width so that non-integer
        # n_components settings accepted by PCA do not break range().
        for n in range(train_pca.shape[1]):
            train_x['pca_{}'.format(n)] = train_pca[:, n]

    scores = model.validate_model(clf, train_x, self.df['mid_price_indicator'])
    res = {
        **self.get_mean_scores(scores),
        'stock': self.stock,
        'method': method,
        'features': feature_name
    }
    test_scores = self.get_score_for_clf_split_pca(
        clf, self.df_test, feature_name=feature_name, pca=pca)
    result = {**res, **test_scores}
    logger.info('Finished training %s %s', self.stock, result)
    return result
def train_svm(self, stock, C, gamma, feature_name, kernel='rbf', coef0=np.nan):
    """Train and score an ``SVC`` for one stock and one feature set.

    Args:
        stock: key into ``self.dfs`` / ``self.dfs_test``.
        C: forwarded to ``SVC`` as ``C``.
        gamma: forwarded to ``SVC`` as ``gamma``.
        feature_name: key into ``self.feature_columns_dict``.
        kernel: forwarded to ``SVC`` as ``kernel``.
        coef0: forwarded to ``SVC`` only when it is a non-zero, non-NaN
            number; otherwise ``SVC`` keeps its own default.

    Returns:
        dict combining the output of ``self.get_mean_scores``, the
        hyper-parameters as passed in (including a NaN ``coef0``) and the
        scores returned by ``self.get_score_for_clf`` for the test frame.
    """
    svc_params = {'kernel': kernel, 'C': C, 'gamma': gamma}
    # NaN is truthy, so a bare `if coef0:` would forward the np.nan default
    # to SVC as if it were a real hyper-parameter.
    if coef0 and not np.isnan(coef0):
        svc_params['coef0'] = coef0
    clf = SVC(**svc_params)

    train_x = self.dfs[stock][self.feature_columns_dict[feature_name]]
    n_components = self.get_number_of_pca_components(feature_name)
    pca = None
    if n_components:
        pca = PCA(n_components=n_components)
        pca.fit(train_x)
        train_x = pca.transform(train_x)

    scores = model.validate_model(clf, train_x, self.dfs[stock]['mid_price_indicator'])
    res = {
        **self.get_mean_scores(scores),
        'stock': stock,
        'C': C,
        'gamma': gamma,
        'coef0': coef0,
        'kernel': kernel,
        'features': feature_name
    }
    test_scores = self.get_score_for_clf(
        clf, self.dfs_test[stock], feature_name=feature_name, pca=pca)
    return {**res, **test_scores}
def train_svm(self, C=None, gamma=None, feature_name='', kernel='rbf', coef0=np.nan, clf=None):
    """Train an ``SVC`` (or the supplied ``clf``) with PCA on ``gdf`` columns only.

    The columns configured for ``feature_name`` are split by name: columns
    whose name contains ``'gdf'`` are compressed with PCA and re-attached as
    ``pca_0``, ``pca_1``, ... next to the remaining columns. When
    ``get_number_of_pca_components`` returns a falsy value no PCA is fitted
    and the ``gdf`` columns are left out of the training data.

    Args:
        C, gamma, coef0: SVC hyper-parameters. Each one is forwarded to
            ``SVC`` only when it is truthy and not NaN; otherwise ``SVC``
            keeps its own default. Ignored when ``clf`` is given.
        feature_name: key into ``self.feature_columns_dict``.
        kernel: forwarded to ``SVC`` as ``kernel``.
        clf: ready-made classifier to use instead of building an ``SVC``.

    Returns:
        dict combining the output of ``self.get_mean_scores``, the
        hyper-parameters as passed in and the scores returned by
        ``self.get_score_for_clf`` for ``self.df_test``.
    """
    logger.info('Training %s r=%s s=%s: kernel=%s C=%s gamma=%s coef0=%s',
                self.stock, self.r, self.s, kernel, C, gamma, coef0)

    def _is_set(value):
        # NaN is truthy, so a bare truth test would forward the np.nan
        # "unset" default to SVC. Strings (e.g. gamma='scale') are kept.
        if isinstance(value, str):
            return bool(value)
        return bool(value) and not np.isnan(value)

    # Identity test instead of `not clf`: estimators that define __len__
    # (e.g. scikit-learn pipelines and ensembles) are unsafe to truth-test.
    if clf is None:
        candidates = (('C', C), ('gamma', gamma), ('coef0', coef0))
        svc_params = {name: value for name, value in candidates if _is_set(value)}
        clf = SVC(kernel=kernel, **svc_params)

    selected = self.df[self.feature_columns_dict[feature_name]]
    gdf_columns = [col for col in selected.columns if 'gdf' in col]
    other_columns = [col for col in selected.columns if 'gdf' not in col]
    train_pca = selected[gdf_columns]
    # Explicit copy: columns are added below, and assigning into a selection
    # of self.df raises pandas' SettingWithCopyWarning.
    train_x = selected[other_columns].copy()

    n_components = self.get_number_of_pca_components(feature_name)
    pca = None
    if n_components:
        pca = PCA(n_components=n_components)
        pca.fit(train_pca)
        train_pca = pca.transform(train_pca)
        # Iterate over the transformed width so that non-integer
        # n_components settings accepted by PCA do not break range().
        for n in range(train_pca.shape[1]):
            train_x['pca_{}'.format(n)] = train_pca[:, n]
    logger.debug('Training columns: %s', list(train_x.columns))

    scores = model.validate_model(clf, train_x, self.df['mid_price_indicator'])
    res = {
        **self.get_mean_scores(scores),
        'stock': self.stock,
        'C': C,
        'gamma': gamma,
        'coef0': coef0,
        'kernel': kernel,
        'features': feature_name
    }
    # NOTE(review): the PCA here is fitted on the gdf columns only; confirm
    # that get_score_for_clf applies it to the same column subset.
    test_scores = self.get_score_for_clf(
        clf, self.df_test, feature_name=feature_name, pca=pca)
    result = {**res, **test_scores}
    logger.info('Finished training %s %s', self.stock, result)
    return result
def train_svm(self, C=np.nan, gamma=np.nan, feature_name='', kernel='rbf', coef0=np.nan,
              should_validate=True, class_weight=None):
    """Train an ``SVC`` on one feature set and score it on ``self.df_test``.

    Args:
        C, gamma, coef0: SVC hyper-parameters. Each one is forwarded to
            ``SVC`` only when it is truthy and not NaN; otherwise ``SVC``
            keeps its own default.
        feature_name: key into ``self.feature_columns_dict``; also passed to
            ``self.get_pca``.
        kernel: forwarded to ``SVC`` as ``kernel``.
        should_validate: when true, scores come from
            ``model.validate_model`` summarised by ``self.get_mean_scores``;
            otherwise from ``model.train_model``.
        class_weight: forwarded to the ``model`` helper that is called.

    Returns:
        dict combining the training scores, the hyper-parameters as passed
        in, ``pca_components`` (``None`` without PCA) and the scores
        returned by ``self.get_score_for_clf`` for ``self.df_test``.
    """
    logger.info('Training %s r=%s s=%s: kernel=%s C=%s gamma=%s coef0=%s',
                self.stock, self.r, self.s, kernel, C, gamma, coef0)

    def _is_set(value):
        # NaN is truthy, so a bare truth test would forward the np.nan
        # defaults to SVC. Strings (e.g. gamma='scale') are kept.
        if isinstance(value, str):
            return bool(value)
        return bool(value) and not np.isnan(value)

    candidates = (('C', C), ('gamma', gamma), ('coef0', coef0))
    svc_params = {name: value for name, value in candidates if _is_set(value)}
    clf = SVC(kernel=kernel, **svc_params)

    train_x = self.df[self.feature_columns_dict[feature_name]]
    pca = self.get_pca(feature_name)
    if pca:
        train_x = pca.transform(train_x)

    if should_validate:
        scores_arrays = model.validate_model(
            clf, train_x, self.df['mid_price_indicator'], class_weight=class_weight)
        scores = self.get_mean_scores(scores_arrays)
    else:
        scores = model.train_model(
            clf, train_x, self.df['mid_price_indicator'], class_weight=class_weight)

    components_num = pca.n_components_ if pca else None
    res = {
        **scores,
        'stock': self.stock,
        'C': C,
        'gamma': gamma,
        'coef0': coef0,
        'kernel': kernel,
        'features': feature_name,
        'pca_components': components_num
    }
    test_scores = self.get_score_for_clf(
        clf, self.df_test, feature_name=feature_name, pca=pca)
    result = {**res, **test_scores}
    logger.info('Finished training %s %s', self.stock, result)
    return result
def train_clf(self, clf, feature_name='', should_validate=True, method=None, class_weight=None):
    """Train ``clf`` on one feature set and score it on ``self.df_test``.

    Args:
        clf: classifier handed to the ``model`` helpers.
        feature_name: key into ``self.feature_columns_dict``; also passed to
            ``self.get_pca``, whose result (when truthy) transforms the
            training columns.
        should_validate: when true, scores come from
            ``model.validate_model`` summarised by ``self.get_mean_scores``;
            otherwise from ``model.train_model``.
        method: label stored under ``'kernel'``; a falsy value is replaced
            by ``'logistic'``.
        class_weight: forwarded to the ``model`` helper that is called.

    Returns:
        dict combining the training scores, the labels, ``pca_components``
        (``None`` without PCA) and the scores returned by
        ``self.get_score_for_clf`` for ``self.df_test``.
    """
    logger.info('Training %s r=%s s=%s: clf=%s', self.stock, self.r, self.s, clf)
    inputs = self.df[self.feature_columns_dict[feature_name]]
    pca = self.get_pca(feature_name)
    if pca:
        inputs = pca.transform(inputs)

    if not should_validate:
        scores = model.train_model(
            clf, inputs, self.df['mid_price_indicator'], class_weight=class_weight)
    else:
        per_fold = model.validate_model(
            clf, inputs, self.df['mid_price_indicator'], class_weight=class_weight)
        scores = self.get_mean_scores(per_fold)

    summary = {
        **scores,
        'stock': self.stock,
        'kernel': method or 'logistic',
        'features': feature_name,
        'pca_components': pca.n_components_ if pca else None,
    }
    held_out = self.get_score_for_clf(
        clf, self.df_test, feature_name=feature_name, pca=pca)
    combined = {**summary, **held_out}
    logger.info('Finished training %s %s', self.stock, combined)
    return combined
def get_logistic_regression(stock, data_length):
    """Score a ``LogisticRegression`` on the ``queue_imbalance`` column.

    Args:
        stock: identifier passed to ``lob.load_prepared_data`` and stored
            under ``'stock'`` in the result.
        data_length: passed to ``lob.load_prepared_data`` as ``length``.

    Returns:
        dict combining the output of ``get_mean_scores`` for the 2-fold
        validation run, the ``stock`` and ``kernel`` labels and the scores
        returned by ``get_score_for_clf`` for the test frame.
    """
    train_frame, test_frame = lob.load_prepared_data(
        stock, data_dir='../gaussian_filter/data', cv=False, length=data_length)
    classifier = LogisticRegression()

    inputs = train_frame[['queue_imbalance']]
    fold_scores = model.validate_model(
        classifier, inputs, train_frame['mid_price_indicator'], folds=2)

    summary = {
        **get_mean_scores(fold_scores),
        'stock': stock,
        'kernel': 'logistic',
    }
    held_out = get_score_for_clf(classifier, test_frame)
    return {**summary, **held_out}
def train_svm(self, C=None, gamma=None, feature_name='', kernel='rbf', coef0=np.nan, clf=None):
    """Train an ``SVC`` (or the supplied ``clf``) on one feature set.

    Args:
        C, gamma, coef0: SVC hyper-parameters. Each one is forwarded to
            ``SVC`` only when it is truthy and not NaN; otherwise ``SVC``
            keeps its own default. Ignored when ``clf`` is given.
        feature_name: key into ``self.feature_columns_dict``.
        kernel: forwarded to ``SVC`` as ``kernel``.
        clf: ready-made classifier to use instead of building an ``SVC``.

    Returns:
        An empty dict when ``np.any(self.df)`` is false; otherwise a dict
        combining the output of ``self.get_mean_scores``, the
        hyper-parameters as passed in and the scores returned by
        ``self.get_score_for_clf`` for ``self.df_test``.
    """
    # Nothing to train on; presumably covers a missing or empty frame -
    # NOTE(review): an all-zero frame is skipped as well.
    if not np.any(self.df):
        return {}

    def _is_set(value):
        # NaN is truthy, so a bare truth test would forward the np.nan
        # "unset" default to SVC. Strings (e.g. gamma='scale') are kept.
        if isinstance(value, str):
            return bool(value)
        return bool(value) and not np.isnan(value)

    # Identity test instead of `not clf`: estimators that define __len__
    # (e.g. scikit-learn pipelines and ensembles) are unsafe to truth-test.
    if clf is None:
        logger.info('Training %s r=%s s=%s: kernel=%s C=%s gamma=%s coef0=%s',
                    self.stock, self.r, self.s, kernel, C, gamma, coef0)
        candidates = (('C', C), ('gamma', gamma), ('coef0', coef0))
        svc_params = {name: value for name, value in candidates if _is_set(value)}
        clf = SVC(kernel=kernel, **svc_params)
    logger.info('Training %s r=%s s=%s: clf=%s', self.stock, self.r, self.s, clf)

    train_x = self.df[self.feature_columns_dict[feature_name]]
    n_components = self.get_number_of_pca_components(feature_name)
    pca = None
    if n_components:
        pca = PCA(n_components=n_components)
        pca.fit(train_x)
        train_x = pca.transform(train_x)

    logger.info('Will validate model for stock=%s r=%s s=%s', self.stock, self.r, self.s)
    scores = model.validate_model(clf, train_x, self.df['mid_price_indicator'])
    res = {
        **self.get_mean_scores(scores),
        'stock': self.stock,
        'C': C,
        'gamma': gamma,
        'coef0': coef0,
        'kernel': kernel,
        'features': feature_name
    }
    test_scores = self.get_score_for_clf(
        clf, self.df_test, feature_name=feature_name, pca=pca)
    result = {**res, **test_scores}
    logger.info('Finished training %s %s', self.stock, result)
    return result
def train_clf(self, clf, feature_name='', should_validate=True, method=None, class_weight=None):
    """Train ``clf`` on one feature set and score it on ``self.df_test``.

    Args:
        clf: classifier handed to the ``model`` helpers.
        feature_name: key into ``self.feature_columns_dict``.
        should_validate: when true, scores come from
            ``model.validate_model`` summarised by ``self.get_mean_scores``;
            otherwise from ``model.train_model``.
        method: label stored under ``'kernel'``; a falsy value is replaced
            by ``'logistic'``.
        class_weight: forwarded to the ``model`` helper that is called.

    Returns:
        dict combining the training scores, the ``stock``/``kernel``/
        ``features`` labels and the scores returned by
        ``self.get_score_for_clf`` for ``self.df_test``.
    """
    logger.info('Training %s: clf=%s', self.stock, clf)
    inputs = self.df[self.feature_columns_dict[feature_name]]

    if not should_validate:
        scores = model.train_model(
            clf, inputs, self.df['mid_price_indicator'], class_weight=class_weight)
    else:
        per_fold = model.validate_model(
            clf, inputs, self.df['mid_price_indicator'], class_weight=class_weight)
        scores = self.get_mean_scores(per_fold)

    summary = {
        **scores,
        'stock': self.stock,
        'kernel': method or 'logistic',
        'features': feature_name,
    }
    held_out = self.get_score_for_clf(clf, self.df_test, feature_name=feature_name)
    combined = {**summary, **held_out}
    logger.info('Finished training %s %s', self.stock, combined)
    return combined
def svm_classification(df, gdf_columns) -> dict:
    """Validate a default ``LogisticRegression`` on selected columns of ``df``.

    Despite its name, this function builds a ``LogisticRegression``, not an
    SVM.

    Args:
        df: frame holding the feature columns and ``mid_price_indicator``.
        gdf_columns: labels of the columns used as model inputs.

    Returns:
        The value returned by ``model.validate_model`` for the classifier.
    """
    inputs = df.loc[:, gdf_columns]
    # The target is handed over as a single-column 2-D array.
    target = df['mid_price_indicator'].values.reshape(-1, 1)
    classifier = LogisticRegression()
    return model.validate_model(classifier, inputs, target)