def main(dataset: str):
    """Print a feature-availability matrix for every symbol in *dataset*.

    Builds a table with one row per (templated) metric family and one column
    per symbol, where each cell is 'Y' if that symbol's dataset provides the
    feature and 'N' otherwise.
    """
    ds_service = DatasetService()
    symbols = ds_service.get_dataset_symbols(name=dataset)
    ds_data = {
        s: ds_service.get_dataset(name=dataset, symbol=s).features
        for s in symbols
    }
    # Flatten: one record per symbol, one 'Y' flag per templated feature name.
    records = []
    for symbol, features in ds_data.items():
        record = {'symbol': symbol.replace('USD', '')}
        for f in features:
            record[_collapse_feature_name(f)] = 'Y'
        records.append(record)

    # Features absent for a symbol become 'N'; transpose so symbols are columns.
    result_frame = pd.DataFrame.from_records(records).fillna(value='N')
    result_frame = result_frame.set_index(keys='symbol').T
    print(result_frame.head())


def _collapse_feature_name(f: str) -> str:
    """Collapse threshold-parameterized metric names into one templated family.

    E.g. 'adrbal1in1mcnt' and 'adrbal1in10mcnt' both map to 'adrbal1in{N}cnt',
    so every symbol shares a single column per metric family.  Names matching
    no known family are returned unchanged.  Check order mirrors the original
    chain, so overlapping prefixes resolve identically.
    """
    if f.startswith('adrbal1in') and f.endswith('cnt'):
        return 'adrbal1in{N}cnt'
    if f.startswith('adrbalntv') and f.endswith('cnt'):
        return 'adrbalntv{N}cnt'
    if f.startswith('splyact') and 'pct' not in f:
        return 'splyact{T}'
    if f.startswith('splyadrbal1in'):
        return 'splyadrbal1in{N}'
    if f.startswith('splyadrbalntv'):
        return 'splyadrbalntv{N}'
    if f.startswith('splyadrtop'):
        return 'splyadrtop{N}'
    if f.startswith('adrbalusd') and f.endswith('cnt'):
        return 'adrbalusd{N}cnt'
    if f.startswith('splyadrbalusd'):
        return 'splyadrbalusd{N}'
    if f.startswith('txtfrval') and f.endswith('ntv'):
        return 'txtfrval{A}ntv'
    if f.startswith('txtfrval') and f.endswith('usd'):
        return 'txtfrval{A}usd'
    if f.startswith('fee') and f.endswith('usd'):
        return 'fee{A}usd'
    if f.startswith('gaslmtblk'):
        return 'gaslmtblk'
    if f.startswith('gaslmttx'):
        return 'gaslmttx'
    if f.startswith('gasusedtx'):
        return 'gasusedtx'
    if f.startswith('isccont'):
        # NOTE(review): maps 'isccont*' onto 'isscont' — looks like a spelling
        # normalization; confirm the intended canonical name.
        return 'isscont'
    return f
def main(dataset: str,
         target: str,
         splits: Optional[int] = 1,
         type: Optional[str] = 'sh'):
    """Write a batch script running grid_search_new.py for every
    (symbol, pipeline) pair of *dataset*.

    Parameters:
        dataset: dataset name whose symbols are enumerated.
        target: prediction target forwarded to each command line.
        splits: kept for backward compatibility; currently unused.
        type: output flavor — 'cmd' for a Windows batch file, 'sh' for bash.

    Returns:
        The destination file name (without extension).

    Raises:
        ValueError: if *type* is neither 'cmd' nor 'sh'.
    """
    dss = DatasetService()
    symbols = dss.get_dataset_symbols(name=dataset)

    lines = [
        f"python grid_search_new.py {symbol} {dataset} {target} {pipeline} --feature-selection-method importances_shap"
        for symbol in symbols
        for pipeline in PIPELINE_LIST
    ]

    destfile = f"gridsearch_{dataset}_{target}_all"
    if type == 'cmd':
        with open(destfile + ".cmd", "w") as f:
            f.write("\n".join(["@echo off"] + lines))
    elif type == 'sh':
        with open(destfile + ".sh", "w") as f:
            f.write("\n".join(["#!/bin/bash"] + lines))
    else:
        # Previously an unknown type wrote nothing yet still reported success;
        # fail loudly instead.
        raise ValueError(f"Unknown script type '{type}' (expected 'cmd' or 'sh')")
    print(f"Grid search script saved to {destfile}")
    return destfile
def main(dataset: str, target: str):
    """Run an 'importances' feature search for every symbol in *dataset*.

    Clears any previously stored feature-search results matching
    (dataset, target), then for each symbol creates a search request over the
    train portion (70%) of the data and executes it synchronously, logging
    start/end timestamps per symbol.
    """
    service = FeatureSelectionService()
    models = ModelService()
    datasets = DatasetService()

    query = {"dataset": dataset, "target": target}
    # Drop stale feature-search results before re-running the search.
    models.clear_features(query)
    symbols = datasets.get_dataset_symbols(dataset)
    for i, sym in enumerate(symbols):
        logging.info("==[{}/{}]== Dataset: {} {} {} =====".format(
            i + 1, len(symbols), sym, dataset, target))
        mf = service.create_features_search(target=target,
                                            dataset=dataset,
                                            symbol=sym,
                                            split=0.7,
                                            method='importances')
        logging.info("[{}] Start feature search".format(get_timestamp()))
        mf = service.feature_selection(mf, sync=True)
        logging.info("[{}] End feature search".format(get_timestamp()))
class FeatureSelectionService:
    """Creates, runs, and persists feature-selection searches over dataset
    features for a given (dataset, symbol, target) triple."""

    def __init__(self):
        self.model_repo = ModelRepository()
        self.dataset_service = DatasetService()

    def create_features_search(self,
                               *,
                               symbol: str,
                               dataset: str,
                               target: str,
                               split: float,
                               method: str,
                               task_key: str = None) -> ModelFeatures:
        """Build a ModelFeatures request whose search interval is the train
        portion of the dataset's train/test split.

        Args:
            symbol: asset symbol within the dataset.
            dataset: dataset name.
            target: prediction target name.
            split: train fraction (e.g. 0.7) used to derive the interval.
            method: feature-selection method identifier.
            task_key: optional tracking key; a fresh UUID when omitted.
        """
        ds = self.dataset_service.get_dataset(dataset, symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)
        return ModelFeatures(dataset=dataset,
                             target=target,
                             symbol=symbol,
                             search_interval=splits['train'],
                             feature_selection_method=method,
                             task_key=task_key or str(uuid4()))

    def feature_selection(self, mf: ModelFeatures, **kwargs) -> ModelFeatures:
        """Execute the feature-selection method described by *mf*.

        Loads features/target over the request's search interval, runs the
        selected method, records start/end timestamps and the supported
        feature names on *mf*, and (unless save=False is passed) appends the
        results to the matching models in the repository.

        Raises:
            MessageException: if the target has fewer than 2 classes.
            NotFoundException: if the method identifier is unknown.
        """
        # Load features and target restricted to the search interval.
        X = self.dataset_service.get_features(mf.dataset,
                                              mf.symbol,
                                              mf.search_interval.begin,
                                              mf.search_interval.end,
                                              columns=mf.features)
        y = self.dataset_service.get_target(mf.target, mf.symbol,
                                            mf.search_interval.begin,
                                            mf.search_interval.end)

        # Selection is only meaningful for a (at least) binary target.
        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}]Training data contains less than 2 classes: {}".
                format(mf.symbol, mf.dataset, mf.target, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))

        # Perform search
        mf.start_at = get_timestamp()  # Log starting timestamp
        if not mf.feature_selection_method or mf.feature_selection_method == 'importances':
            selector = select_from_model(X, y)
            mf.feature_importances = label_feature_importances(
                selector.estimator_, X.columns)
        elif mf.feature_selection_method == 'importances_cv':
            selector = select_from_model_cv(X, y)
            # CV wraps the estimator; importances come from the best one.
            mf.feature_importances = label_feature_importances(
                selector.estimator_.best_estimator_, X.columns)
        elif mf.feature_selection_method == 'fscore':
            selector = select_percentile(X, y, percentile=10)
        elif mf.feature_selection_method == 'relieff':
            selector = select_relieff(X, y, percentile=10)
        elif mf.feature_selection_method == 'multisurf':
            selector = select_multisurf(X, y, percentile=10)
        else:
            raise NotFoundException(
                "Cannot find feature selection method by {}".format(
                    mf.feature_selection_method))
        mf.end_at = get_timestamp()  # Log ending timestamp

        # Update search request with results
        mf.features = label_support(selector.get_support(), X.columns)

        # Persist unless the caller explicitly opted out with save=False.
        if kwargs.get('save', True):
            self.model_repo.append_features_query(
                {
                    "dataset": mf.dataset,
                    "symbol": mf.symbol,
                    "target": mf.target
                }, mf)
        return mf

    def get_available_symbols(self, dataset: str):
        """Return the symbols available in *dataset*."""
        return self.dataset_service.get_dataset_symbols(name=dataset)

    def feature_selection_new(self, *, symbol: str, dataset: str, target: str,
                              split: float, method: str,
                              **kwargs) -> ModelFeatures:
        """Run feature selection and store results on the dataset document.

        Unlike feature_selection(), results live on the dataset (via
        append_feature_selection) rather than on models.  Supported kwargs:
        replace (overwrite an existing result), save (persist; an existing
        result with save=True and no replace raises), task_key.

        Raises:
            MessageException: result already stored, or target has < 2 classes.
            NotFoundException: unknown method identifier.
        """
        ds = self.dataset_service.get_dataset(dataset, symbol)
        fs_exists = DatasetService.has_feature_selection(ds=ds,
                                                         method=method,
                                                         target=target)
        if fs_exists:
            if kwargs.get('replace'):
                self.dataset_service.remove_feature_selection(ds=ds,
                                                              method=method,
                                                              target=target)
            elif kwargs.get('save'):
                # Refuse to silently overwrite stored results.
                raise MessageException(
                    f"Feature selection with method '{method}' already performed for '{dataset}.{symbol}' and target '{target}'"
                )

        splits = DatasetService.get_train_test_split_indices(ds, split)
        fs = FeatureSelection(target=target,
                              method=method,
                              search_interval=splits['train'],
                              task_key=kwargs.get('task_key', str(uuid4())))

        # Load dataset
        X = self.dataset_service.get_dataset_features(
            ds=ds, begin=fs.search_interval.begin, end=fs.search_interval.end)
        y = self.dataset_service.get_dataset_target(
            name=fs.target,
            ds=ds,
            begin=fs.search_interval.begin,
            end=fs.search_interval.end)

        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}]Training data contains less than 2 classes: {}".
                format(symbol, dataset, target, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))

        # Perform search
        fs.start_at = get_timestamp()  # Log starting timestamp
        # Default to plain importances when no method was given: the previous
        # code entered this branch on a falsy method but then crashed on the
        # "'_cv' in fs.method" substring test (TypeError on None).
        method_name = fs.method or 'importances'
        if 'importances' in method_name:
            if '_cv' in method_name:
                selector = select_from_model_cv(X, y)
            else:
                selector = select_from_model(X, y)
            fs.feature_importances = label_feature_importances(
                selector.estimator_, X.columns)
            if '_shap' in method_name:
                fs.shap_values = get_shap_values(
                    model=selector.estimator_.named_steps.c, X=X, X_train=X)
                # NOTE(review): result was previously bound to an unused local;
                # call kept in case parsing validates the stored values.
                parse_shap_values(fs.shap_values)
        elif method_name == 'fscore':
            selector = select_percentile(X, y, percentile=10)
        elif method_name == 'relieff':
            selector = select_relieff(X, y, percentile=10)
        elif method_name == 'multisurf':
            selector = select_multisurf(X, y, percentile=10)
        else:
            raise NotFoundException(
                "Cannot find feature selection method by {}".format(fs.method))
        fs.end_at = get_timestamp()  # Log ending timestamp

        # Update search request with results
        fs.features = label_support(selector.get_support(), X.columns)

        if not kwargs.get('save'):
            return fs
        return self.dataset_service.append_feature_selection(ds, fs)