示例#1
0
    def features(self, clf) -> pd.DataFrame:
        """Cross-validate ``clf`` once per configured feature set.

        For every entry of ``self.feature_columns_dict`` the listed columns of
        ``self.df`` are selected, projected through a freshly fitted PCA when
        ``self.get_number_of_pca_components`` returns a truthy value, and
        passed to ``model.validate_model`` together with the
        ``mid_price_indicator`` column.

        :param clf: classifier forwarded to ``model.validate_model``.
        :return: data frame with one row per feature set holding the mean
            ``matthews``, ``f1`` and ``roc_auc`` scores plus the ``stock``
            and ``features`` labels.
        """
        metric_names = ('matthews', 'f1', 'roc_auc')
        rows = []

        for feature_name in self.feature_columns_dict:
            columns = self.feature_columns_dict[feature_name]
            logger.debug('Checking feature %s for %s r=%s s=%s', feature_name,
                         self.stock, self.r, self.s)
            inputs = self.df[columns]
            component_count = self.get_number_of_pca_components(feature_name)
            if component_count:
                # Fit and project on the same training data.
                projector = PCA(n_components=component_count).fit(inputs)
                inputs = projector.transform(inputs)
            fold_scores = pd.DataFrame(
                model.validate_model(clf, inputs,
                                     self.df['mid_price_indicator']))
            # Average every metric over the validation folds.
            summary = {
                metric: np.mean(fold_scores[metric].values)
                for metric in metric_names
            }
            summary['stock'] = self.stock
            summary['features'] = feature_name
            logger.info('Scores %s r=%s s=%s - %s', self.stock, self.r, self.s,
                        summary)
            rows.append(summary)
        return pd.DataFrame(rows)
def svm_classification(df, gdf_columns, C=1000, gamma=1) -> dict:
    """Validate an RBF-kernel ``SVC`` on the chosen columns of ``df``.

    :param df: data frame holding the feature columns and the
        ``mid_price_indicator`` target column.
    :param gdf_columns: column labels used as model input.
    :param C: ``C`` argument passed to ``SVC``.
    :param gamma: ``gamma`` argument passed to ``SVC``.
    :return: whatever ``model.validate_model`` returns for this setup.
    """
    classifier = SVC(kernel='rbf', C=C, gamma=gamma)
    inputs = df.loc[:, gdf_columns]
    # The target is handed over as a single-column 2-D array.
    target = df['mid_price_indicator'].values.reshape(-1, 1)
    return model.validate_model(classifier, inputs, target)
示例#3
0
 def train_clf_with_split_pca(self, clf, feature_name, method=None):
     """
     Deprecated.

     Validate ``clf`` on a feature set whose ``gdf`` columns are replaced by
     their PCA projection while all other columns are kept untouched.

     :param clf: classifier forwarded to ``model.validate_model``.
     :param feature_name: key into ``self.feature_columns_dict``.
     :param method: label stored under ``'method'`` in the result.
     :return: dict merging the mean validation scores, the ``stock``,
         ``method`` and ``features`` labels, and the test scores returned by
         ``self.get_score_for_clf_split_pca``.
     """
     logger.info('Training %s r=%s s=%s:', self.stock, self.r, self.s)
     selected = self.df[self.feature_columns_dict[feature_name]]
     gdf_columns = [col for col in selected.columns if 'gdf' in col]
     other_columns = [col for col in selected.columns if 'gdf' not in col]
     train_pca = selected[gdf_columns]
     # Explicit copy: columns are added below, and writing into a column
     # selection of self.df would raise pandas' SettingWithCopyWarning.
     train_x = selected[other_columns].copy()
     n_components = self.get_number_of_pca_components(feature_name)
     pca = None
     if n_components:
         pca = PCA(n_components=n_components)
         pca.fit(train_pca)
         train_pca = pca.transform(train_pca)
         # Only append projected columns when a PCA was actually fitted;
         # iterating over range(n_components) outside this guard raised a
         # TypeError whenever the component count was None.
         for n in range(train_pca.shape[1]):
             train_x['pca_{}'.format(n)] = train_pca[:, n]
     scores = model.validate_model(clf, train_x,
                                   self.df['mid_price_indicator'])
     res = {
         **self.get_mean_scores(scores),
         'stock': self.stock,
         'method': method,
         'features': feature_name
     }
     test_scores = self.get_score_for_clf_split_pca(
         clf, self.df_test, feature_name=feature_name, pca=pca)
     result = {**res, **test_scores}
     logger.info('Finished training %s %s', self.stock, result)
     return result
示例#4
0
    def train_svm(self,
                  stock,
                  C,
                  gamma,
                  feature_name,
                  kernel='rbf',
                  coef0=np.nan):
        """Validate an ``SVC`` for one stock and score it on that stock's test set.

        :param stock: key into ``self.dfs`` / ``self.dfs_test``.
        :param C: ``C`` argument passed to ``SVC``.
        :param gamma: ``gamma`` argument passed to ``SVC``.
        :param feature_name: key into ``self.feature_columns_dict``.
        :param kernel: ``kernel`` argument passed to ``SVC``.
        :param coef0: ``coef0`` argument passed to ``SVC``; left at the
            ``SVC`` default when falsy or NaN.
        :return: dict merging the mean validation scores, the parameters used
            and the test scores returned by ``self.get_score_for_clf``.
        """
        svc_params = {'kernel': kernel, 'C': C, 'gamma': gamma}
        # NaN is truthy, so a plain `if coef0:` forwarded the NaN default
        # straight into SVC. Only pass coef0 when it is a usable number.
        coef0_is_nan = (isinstance(coef0, (float, np.floating))
                        and np.isnan(coef0))
        if coef0 and not coef0_is_nan:
            svc_params['coef0'] = coef0
        clf = SVC(**svc_params)

        train_x = self.dfs[stock][self.feature_columns_dict[feature_name]]
        n_components = self.get_number_of_pca_components(feature_name)
        pca = None
        if n_components:
            pca = PCA(n_components=n_components)
            pca.fit(train_x)
            train_x = pca.transform(train_x)

        scores = model.validate_model(clf, train_x,
                                      self.dfs[stock]['mid_price_indicator'])
        res = {
            **self.get_mean_scores(scores),
            'stock': stock,
            'C': C,
            'gamma': gamma,
            'coef0': coef0,
            'kernel': kernel,
            'features': feature_name
        }
        test_scores = self.get_score_for_clf(clf,
                                             self.dfs_test[stock],
                                             feature_name=feature_name,
                                             pca=pca)
        return {**res, **test_scores}
 def train_svm(self, C=None, gamma=None, feature_name='', kernel='rbf', coef0=np.nan, clf=None):
     """Validate an ``SVC`` (or a supplied ``clf``) with PCA applied to the ``gdf`` columns.

     Columns whose name contains ``'gdf'`` are replaced by their PCA
     projection; every other column of the feature set is used as is.

     :param C: ``C`` for the ``SVC`` built here; ignored when unset.
     :param gamma: ``gamma`` for the ``SVC``; only used together with ``C``.
     :param feature_name: key into ``self.feature_columns_dict``.
     :param kernel: ``kernel`` for the ``SVC`` built here.
     :param coef0: ``coef0`` for the ``SVC``; only used together with ``C``
         and ``gamma`` and skipped when falsy or NaN.
     :param clf: ready-made classifier; when given no ``SVC`` is built.
     :return: dict merging the mean validation scores, the parameters and
         the test scores returned by ``self.get_score_for_clf``.
     """
     def _is_set(value):
         # NaN is truthy in Python, so it has to be rejected explicitly.
         if isinstance(value, (float, np.floating)) and np.isnan(value):
             return False
         return bool(value)

     logger.info('Training %s r=%s s=%s: kernel=%s C=%s gamma=%s coef0=%s',
                 self.stock, self.r, self.s, kernel, C, gamma, coef0)
     # `not clf` would call __len__ on estimators that define it (for example
     # unfitted ensembles), so test for None explicitly.
     if clf is None:
         if _is_set(C) and _is_set(gamma):
             svc_params = {'C': C, 'gamma': gamma}
             if _is_set(coef0):
                 svc_params['coef0'] = coef0
         elif _is_set(C):
             svc_params = {'C': C}
         else:
             # Nothing usable was supplied: rely on the SVC defaults instead
             # of forwarding C=None.
             svc_params = {}
         clf = SVC(kernel=kernel, **svc_params)
     selected = self.df[self.feature_columns_dict[feature_name]]
     gdf_columns = [col for col in selected.columns if 'gdf' in col]
     other_columns = [col for col in selected.columns if 'gdf' not in col]
     train_pca = selected[gdf_columns]
     # Explicit copy: columns are added below, and writing into a column
     # selection of self.df would raise pandas' SettingWithCopyWarning.
     train_x = selected[other_columns].copy()
     n_components = self.get_number_of_pca_components(feature_name)
     pca = None
     if n_components:
         pca = PCA(n_components=n_components)
         pca.fit(train_pca)
         train_pca = pca.transform(train_pca)
         # Kept inside the guard: range(n_components) raised a TypeError
         # when the component count was None.
         for n in range(train_pca.shape[1]):
             train_x['pca_{}'.format(n)] = train_pca[:, n]
     logger.debug('Training columns: %s', train_x.columns)
     scores = model.validate_model(clf, train_x, self.df['mid_price_indicator'])
     res = {
         **self.get_mean_scores(scores),
         'stock': self.stock,
         'C': C,
         'gamma': gamma,
         'coef0': coef0,
         'kernel': kernel,
         'features': feature_name
     }
     # NOTE(review): training uses the split gdf/PCA layout, but scoring goes
     # through get_score_for_clf - confirm that helper rebuilds the same
     # columns for the test set.
     test_scores = self.get_score_for_clf(clf, self.df_test, feature_name=feature_name, pca=pca)
     result = {**res, **test_scores}
     logger.info('Finished training %s %s', self.stock, result)
     return result
示例#6
0
 def train_svm(self,
               C=np.nan,
               gamma=np.nan,
               feature_name='',
               kernel='rbf',
               coef0=np.nan,
               should_validate=True,
               class_weight=None):
     """Train an ``SVC`` on one feature set and score it on the test data.

     :param C: ``C`` for the ``SVC``; only used together with ``gamma``.
     :param gamma: ``gamma`` for the ``SVC``; only used together with ``C``.
     :param feature_name: key into ``self.feature_columns_dict``.
     :param kernel: ``kernel`` for the ``SVC``.
     :param coef0: ``coef0`` for the ``SVC``; only used together with ``C``
         and ``gamma`` and skipped when falsy or NaN.
     :param should_validate: when true the scores come from
         ``model.validate_model`` reduced by ``self.get_mean_scores``,
         otherwise from ``model.train_model``.
     :param class_weight: forwarded to the ``model`` helper being called.
     :return: dict merging the training scores, the parameters, the number
         of PCA components (``None`` without PCA) and the test scores
         returned by ``self.get_score_for_clf``.
     """
     def _is_set(value):
         # NaN is truthy in Python, so the NaN defaults must be rejected
         # explicitly or they end up as SVC hyper-parameters.
         if isinstance(value, (float, np.floating)) and np.isnan(value):
             return False
         return bool(value)

     logger.info('Training %s r=%s s=%s: kernel=%s C=%s gamma=%s coef0=%s',
                 self.stock, self.r, self.s, kernel, C, gamma, coef0)
     if _is_set(C) and _is_set(gamma):
         svc_params = {'C': C, 'gamma': gamma}
         if _is_set(coef0):
             svc_params['coef0'] = coef0
     else:
         svc_params = {}
     clf = SVC(kernel=kernel, **svc_params)
     train_x = self.df[self.feature_columns_dict[feature_name]]
     pca = self.get_pca(feature_name)
     if pca:
         train_x = pca.transform(train_x)
     if should_validate:
         scores_arrays = model.validate_model(
             clf,
             train_x,
             self.df['mid_price_indicator'],
             class_weight=class_weight)
         scores = self.get_mean_scores(scores_arrays)
     else:
         scores = model.train_model(clf,
                                    train_x,
                                    self.df['mid_price_indicator'],
                                    class_weight=class_weight)
     components_num = None
     if pca:
         components_num = pca.n_components_
     res = {
         **scores,
         'stock': self.stock,
         'C': C,
         'gamma': gamma,
         'coef0': coef0,
         'kernel': kernel,
         'features': feature_name,
         'pca_components': components_num
     }
     test_scores = self.get_score_for_clf(clf,
                                          self.df_test,
                                          feature_name=feature_name,
                                          pca=pca)
     result = {**res, **test_scores}
     logger.info('Finished training %s %s', self.stock, result)
     return result
示例#7
0
 def train_clf(self,
               clf,
               feature_name='',
               should_validate=True,
               method=None,
               class_weight=None):
     """Train ``clf`` on one feature set and score it on the test data.

     :param clf: classifier to train.
     :param feature_name: key into ``self.feature_columns_dict``.
     :param should_validate: when true the scores come from
         ``model.validate_model`` reduced by ``self.get_mean_scores``,
         otherwise from ``model.train_model``.
     :param method: label stored under ``'kernel'``; ``'logistic'`` is used
         when it is falsy.
     :param class_weight: forwarded to the ``model`` helper being called.
     :return: dict merging the training scores, the labels, the number of
         PCA components (``None`` without PCA) and the test scores returned
         by ``self.get_score_for_clf``.
     """
     logger.info('Training %s r=%s s=%s: clf=%s', self.stock, self.r,
                 self.s, clf)
     inputs = self.df[self.feature_columns_dict[feature_name]]
     pca = self.get_pca(feature_name)
     if pca:
         inputs = pca.transform(inputs)

     if should_validate:
         fold_scores = model.validate_model(
             clf,
             inputs,
             self.df['mid_price_indicator'],
             class_weight=class_weight)
         scores = self.get_mean_scores(fold_scores)
     else:
         scores = model.train_model(clf,
                                    inputs,
                                    self.df['mid_price_indicator'],
                                    class_weight=class_weight)

     label = method if method else 'logistic'
     summary = {
         **scores,
         'stock': self.stock,
         'kernel': label,
         'features': feature_name,
         'pca_components': pca.n_components_ if pca else None,
     }
     held_out = self.get_score_for_clf(clf,
                                       self.df_test,
                                       feature_name=feature_name,
                                       pca=pca)
     logger.info('Finished training %s %s', self.stock,
                 {**summary, **held_out})
     return {**summary, **held_out}
示例#8
0
def get_logistic_regression(stock, data_length):
    """Validate a logistic regression on ``queue_imbalance`` for one stock.

    The data is loaded through ``lob.load_prepared_data`` from
    ``../gaussian_filter/data`` and validated with two folds.

    :param stock: stock identifier passed to the loader and stored in the
        result.
    :param data_length: forwarded to the loader as ``length``.
    :return: dict merging the mean validation scores, the ``stock`` and
        ``kernel`` labels and the scores from ``get_score_for_clf`` on the
        test frame.
    """
    train_frame, test_frame = lob.load_prepared_data(
        stock,
        data_dir='../gaussian_filter/data',
        cv=False,
        length=data_length)
    classifier = LogisticRegression()
    inputs = train_frame[['queue_imbalance']]

    fold_scores = model.validate_model(classifier,
                                       inputs,
                                       train_frame['mid_price_indicator'],
                                       folds=2)
    summary = {**get_mean_scores(fold_scores)}
    summary['stock'] = stock
    summary['kernel'] = 'logistic'
    held_out = get_score_for_clf(classifier, test_frame)
    return {**summary, **held_out}
示例#9
0
    def train_svm(self, C=None, gamma=None, feature_name='', kernel='rbf', coef0=np.nan, clf=None):
        """Validate an ``SVC`` (or a supplied ``clf``) and score it on the test data.

        Returns an empty dict straight away when ``np.any(self.df)`` is false.

        :param C: ``C`` for the ``SVC``; only used together with ``gamma``.
        :param gamma: ``gamma`` for the ``SVC``; only used together with ``C``.
        :param feature_name: key into ``self.feature_columns_dict``.
        :param kernel: ``kernel`` for the ``SVC`` built here.
        :param coef0: ``coef0`` for the ``SVC``; only used together with ``C``
            and ``gamma`` and skipped when falsy or NaN.
        :param clf: ready-made classifier; when given no ``SVC`` is built.
        :return: dict merging the mean validation scores, the parameters and
            the test scores returned by ``self.get_score_for_clf``.
        """
        def _is_set(value):
            # NaN is truthy in Python, so it has to be rejected explicitly.
            if isinstance(value, (float, np.floating)) and np.isnan(value):
                return False
            return bool(value)

        if not np.any(self.df):
            return {}

        # `not clf` would call __len__ on estimators that define it (for
        # example unfitted ensembles), so test for None explicitly.
        if clf is None:
            logger.info('Training %s r=%s s=%s: kernel=%s C=%s gamma=%s coef0=%s',
                        self.stock, self.r, self.s, kernel, C, gamma, coef0)
            if _is_set(C) and _is_set(gamma):
                svc_params = {'C': C, 'gamma': gamma}
                # The NaN default used to be forwarded to SVC here.
                if _is_set(coef0):
                    svc_params['coef0'] = coef0
                clf = SVC(kernel=kernel, **svc_params)
            else:
                clf = SVC(kernel=kernel)
                logger.info('Training %s r=%s s=%s: clf=%s',
                            self.stock, self.r, self.s, clf)
        train_x = self.df[self.feature_columns_dict[feature_name]]
        n_components = self.get_number_of_pca_components(feature_name)
        pca = None
        if n_components:
            pca = PCA(n_components=n_components)
            pca.fit(train_x)
            train_x = pca.transform(train_x)
        logger.info('Will validate model for stock=%s r=%s s=%s', self.stock, self.r, self.s)
        scores = model.validate_model(clf, train_x, self.df['mid_price_indicator'])
        res = {
            **self.get_mean_scores(scores),
            'stock': self.stock,
            'C': C,
            'gamma': gamma,
            'coef0': coef0,
            'kernel': kernel,
            'features': feature_name
        }
        test_scores = self.get_score_for_clf(clf, self.df_test, feature_name=feature_name, pca=pca)
        result = {**res, **test_scores}
        logger.info('Finished training %s %s', self.stock, result)
        return result
示例#10
0
 def train_clf(self,
               clf,
               feature_name='',
               should_validate=True,
               method=None,
               class_weight=None):
     """Train ``clf`` on one feature set and score it on the test data.

     :param clf: classifier to train.
     :param feature_name: key into ``self.feature_columns_dict``.
     :param should_validate: when true the scores come from
         ``model.validate_model`` reduced by ``self.get_mean_scores``,
         otherwise from ``model.train_model``.
     :param method: label stored under ``'kernel'``; ``'logistic'`` is used
         when it is falsy.
     :param class_weight: forwarded to the ``model`` helper being called.
     :return: dict merging the training scores, the ``stock``, ``kernel``
         and ``features`` labels and the test scores returned by
         ``self.get_score_for_clf``.
     """
     logger.info('Training %s: clf=%s', self.stock, clf)
     inputs = self.df[self.feature_columns_dict[feature_name]]

     if should_validate:
         fold_scores = model.validate_model(
             clf,
             inputs,
             self.df['mid_price_indicator'],
             class_weight=class_weight)
         scores = self.get_mean_scores(fold_scores)
     else:
         scores = model.train_model(clf,
                                    inputs,
                                    self.df['mid_price_indicator'],
                                    class_weight=class_weight)

     label = method if method else 'logistic'
     summary = {
         **scores,
         'stock': self.stock,
         'kernel': label,
         'features': feature_name,
     }
     held_out = self.get_score_for_clf(clf,
                                       self.df_test,
                                       feature_name=feature_name)
     logger.info('Finished training %s %s', self.stock,
                 {**summary, **held_out})
     return {**summary, **held_out}
def svm_classification(df, gdf_columns) -> dict:
    """Validate a ``LogisticRegression`` on the chosen columns of ``df``.

    Despite its name, this function builds a logistic regression, not an SVM.

    :param df: data frame holding the feature columns and the
        ``mid_price_indicator`` target column.
    :param gdf_columns: column labels used as model input.
    :return: whatever ``model.validate_model`` returns for this setup.
    """
    classifier = LogisticRegression()
    inputs = df.loc[:, gdf_columns]
    # The target is handed over as a single-column 2-D array.
    target = df['mid_price_indicator'].values.reshape(-1, 1)
    return model.validate_model(classifier, inputs, target)