def normalize_feature_name(f: str) -> str:
    """Collapse a per-asset feature name into its parameterized family name.

    Many on-chain metrics come in families differing only by a threshold or
    window (e.g. ``adrbal1in1kcnt`` / ``adrbal1in1mcnt``); the summary table
    wants one column per family, so members are mapped to a placeholder form
    such as ``adrbal1in{N}cnt``. Names matching no family are returned as-is.
    """
    # Ordered like the original elif chain; prefixes are mutually exclusive
    # except where the extra endswith()/`pct` tests disambiguate.
    if f.startswith('adrbal1in') and f.endswith('cnt'):
        return 'adrbal1in{N}cnt'
    if f.startswith('adrbalntv') and f.endswith('cnt'):
        return 'adrbalntv{N}cnt'
    if f.startswith('splyact') and 'pct' not in f:
        return 'splyact{T}'
    if f.startswith('splyadrbal1in'):
        return 'splyadrbal1in{N}'
    if f.startswith('splyadrbalntv'):
        return 'splyadrbalntv{N}'
    if f.startswith('splyadrtop'):
        return 'splyadrtop{N}'
    if f.startswith('adrbalusd') and f.endswith('cnt'):
        return 'adrbalusd{N}cnt'
    if f.startswith('splyadrbalusd'):
        return 'splyadrbalusd{N}'
    if f.startswith('txtfrval') and f.endswith('ntv'):
        return 'txtfrval{A}ntv'
    if f.startswith('txtfrval') and f.endswith('usd'):
        return 'txtfrval{A}usd'
    if f.startswith('fee') and f.endswith('usd'):
        return 'fee{A}usd'
    if f.startswith('gaslmtblk'):
        return 'gaslmtblk'
    if f.startswith('gaslmttx'):
        return 'gaslmttx'
    if f.startswith('gasusedtx'):
        return 'gasusedtx'
    # BUGFIX: was startswith('isccont'), which could never match the
    # issuance-continuous metrics (isscontntv, isscontusd, ...) that the
    # 'isscont' replacement value clearly targets.
    if f.startswith('isscont'):
        return 'isscont'
    return f


def main(dataset: str):
    """Print a per-symbol feature-availability matrix for ``dataset``.

    For every symbol in the dataset, marks each (normalized) feature family
    with 'Y' if present; missing families are filled with 'N'. The resulting
    frame is transposed so rows are feature families and columns are symbols.
    """
    ds_service = DatasetService()
    symbols = ds_service.get_dataset_symbols(name=dataset)
    ds_data = {
        s: ds_service.get_dataset(name=dataset, symbol=s).features
        for s in symbols
    }
    # Reshape/flatten: one record per symbol, one 'Y' flag per feature family.
    records = []
    for symbol, features in ds_data.items():
        record = {'symbol': symbol.replace('USD', '')}
        for f in features:
            record[normalize_feature_name(f)] = 'Y'
        records.append(record)
    result_frame = pd.DataFrame.from_records(records).fillna(value='N')
    result_frame = result_frame.set_index(keys='symbol').T
    # NOTE(review): the LaTeX rendering is computed but never printed or
    # saved — confirm intent before removing.
    latex = result_frame.to_latex()
    print(result_frame.head())
def main(dataset: str, target: str, splits: Optional[int] = 1, type: Optional[str] = 'sh'):
    """Generate a batch script running grid search for every symbol/pipeline pair.

    Writes one ``python grid_search_new.py ...`` line per (symbol, pipeline)
    combination to either a Windows ``.cmd`` or a bash ``.sh`` file.

    Args:
        dataset: dataset name; symbols are looked up via DatasetService.
        target: target column passed through to grid_search_new.py.
        splits: currently unused; kept for CLI backward compatibility.
        type: 'cmd' for a Windows batch file, 'sh' for a bash script.

    Returns:
        The destination file name WITHOUT its extension (original behavior).

    Raises:
        ValueError: if ``type`` is neither 'cmd' nor 'sh'. Previously an
            unknown type silently wrote nothing yet still reported success.
    """
    dss = DatasetService()
    symbols = dss.get_dataset_symbols(name=dataset)
    lines = [
        f"python grid_search_new.py {symbol} {dataset} {target} {pipeline} --feature-selection-method importances_shap"
        for symbol in symbols
        for pipeline in PIPELINE_LIST
    ]
    destfile = f"gridsearch_{dataset}_{target}_all"
    if type == 'cmd':
        header, ext = "@echo off", ".cmd"
    elif type == 'sh':
        header, ext = "#!/bin/bash", ".sh"
    else:
        raise ValueError(f"Unknown script type '{type}': expected 'cmd' or 'sh'")
    with open(destfile + ext, "w") as f:
        f.write("\n".join([header] + lines))
    print(f"Grid search script saved to {destfile}")
    return destfile
def main(dataset: str, target: str):
    """Run feature selection for every symbol of ``dataset`` against ``target``.

    Clears any previously stored feature-search results for the
    dataset/target pair, then performs a synchronous importance-based
    feature search per symbol, logging start/end timestamps.
    """
    fs_service = FeatureSelectionService()
    model_service = ModelService()
    ds_service = DatasetService()

    # Drop stale feature-search results before recomputing.
    model_service.clear_features({"dataset": dataset, "target": target})

    symbols = ds_service.get_dataset_symbols(dataset)
    total = len(symbols)
    for idx, symbol in enumerate(symbols, start=1):
        logging.info("==[{}/{}]== Dataset: {} {} {} =====".format(
            idx, total, symbol, dataset, target))
        search = fs_service.create_features_search(target=target,
                                                   dataset=dataset,
                                                   symbol=symbol,
                                                   split=0.7,
                                                   method='importances')
        logging.info("[{}] Start feature search".format(get_timestamp()))
        search = fs_service.feature_selection(search, sync=True)
        logging.info("[{}] End feature search".format(get_timestamp()))
class FeatureSelectionService:
    """Creates, runs, and persists feature-selection searches over dataset features."""

    def __init__(self):
        self.model_repo = ModelRepository()
        self.dataset_service = DatasetService()

    def create_features_search(self, *, symbol: str, dataset: str, target: str,
                               split: float, method: str,
                               task_key: str = None) -> ModelFeatures:
        """Build a ModelFeatures request whose search interval is the train
        portion of the dataset's train/test split.

        A random task key is generated when ``task_key`` is not provided.
        """
        ds = self.dataset_service.get_dataset(dataset, symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)
        return ModelFeatures(dataset=dataset,
                             target=target,
                             symbol=symbol,
                             search_interval=splits['train'],
                             feature_selection_method=method,
                             task_key=task_key or str(uuid4()))

    def feature_selection(self, mf: ModelFeatures, **kwargs) -> ModelFeatures:
        """Run the feature-selection method recorded on ``mf`` and store the
        selected features (and importances, where applicable) back on it.

        Results are persisted via the model repository unless ``save=False``
        is passed. Raises MessageException when the target has fewer than 2
        classes in the search interval, NotFoundException for an unknown
        selection method.
        """
        # Load features and target restricted to the search interval.
        X = self.dataset_service.get_features(mf.dataset, mf.symbol,
                                              mf.search_interval.begin,
                                              mf.search_interval.end,
                                              columns=mf.features)
        y = self.dataset_service.get_target(mf.target, mf.symbol,
                                            mf.search_interval.begin,
                                            mf.search_interval.end)
        unique = np.unique(y)  # counts were never used
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}]Training data contains less than 2 classes: {}".format(
                    mf.symbol, mf.dataset, mf.target, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(unique))

        # Perform search
        mf.start_at = get_timestamp()  # Log starting timestamp
        if not mf.feature_selection_method or mf.feature_selection_method == 'importances':
            selector = select_from_model(X, y)
            mf.feature_importances = label_feature_importances(
                selector.estimator_, X.columns)
        elif mf.feature_selection_method == 'importances_cv':
            # CV wraps the estimator in a search object; unwrap best_estimator_.
            selector = select_from_model_cv(X, y)
            mf.feature_importances = label_feature_importances(
                selector.estimator_.best_estimator_, X.columns)
        elif mf.feature_selection_method == 'fscore':
            selector = select_percentile(X, y, percentile=10)
        elif mf.feature_selection_method == 'relieff':
            selector = select_relieff(X, y, percentile=10)
        elif mf.feature_selection_method == 'multisurf':
            selector = select_multisurf(X, y, percentile=10)
        else:
            raise NotFoundException(
                "Cannot find feature selection method by {}".format(
                    mf.feature_selection_method))
        mf.end_at = get_timestamp()  # Log ending timestamp

        # Update search request with results
        mf.features = label_support(selector.get_support(), X.columns)
        # Update model with the new results (default: persist)
        if kwargs.get('save', True):
            self.model_repo.append_features_query(
                {
                    "dataset": mf.dataset,
                    "symbol": mf.symbol,
                    "target": mf.target
                }, mf)
        return mf

    def get_available_symbols(self, dataset: str):
        """Return the symbols available in ``dataset``."""
        return self.dataset_service.get_dataset_symbols(name=dataset)

    def feature_selection_new(self, *, symbol: str, dataset: str, target: str,
                              split: float, method: str,
                              **kwargs) -> FeatureSelection:
        """Run feature selection and attach the result to the dataset record.

        Unlike :meth:`feature_selection`, results live on the dataset
        (``FeatureSelection``) rather than on a model document. If a result
        for (method, target) already exists: ``replace=True`` removes it
        first; otherwise ``save=True`` raises, and with ``save`` falsy the
        search is simply re-run without persisting (original behavior).
        """
        ds = self.dataset_service.get_dataset(dataset, symbol)
        fs_exists = DatasetService.has_feature_selection(ds=ds,
                                                         method=method,
                                                         target=target)
        if fs_exists:
            if kwargs.get('replace'):
                self.dataset_service.remove_feature_selection(ds=ds,
                                                              method=method,
                                                              target=target)
            elif kwargs.get('save'):
                # BUGFIX: message previously read 'alrady'.
                raise MessageException(
                    f"Feature selection with method '{method}' already performed "
                    f"for '{dataset}.{symbol}' and target '{target}'")
        splits = DatasetService.get_train_test_split_indices(ds, split)
        fs = FeatureSelection(target=target,
                              method=method,
                              search_interval=splits['train'],
                              task_key=kwargs.get('task_key', str(uuid4())))
        # Load dataset features and target over the train interval.
        X = self.dataset_service.get_dataset_features(
            ds=ds,
            begin=fs.search_interval.begin,
            end=fs.search_interval.end)
        y = self.dataset_service.get_dataset_target(
            name=fs.target,
            ds=ds,
            begin=fs.search_interval.begin,
            end=fs.search_interval.end)
        unique = np.unique(y)  # counts were never used
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}]Training data contains less than 2 classes: {}".format(
                    symbol, dataset, target, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(unique))

        # Perform search
        fs.start_at = get_timestamp()  # Log starting timestamp
        # BUGFIX: a falsy fs.method previously fell into this branch and then
        # crashed on `'_cv' in fs.method`; default it to 'importances'.
        method_name = fs.method or 'importances'
        if 'importances' in method_name:
            if '_cv' in method_name:
                selector = select_from_model_cv(X, y)
                # BUGFIX: unwrap the CV search's best_estimator_, consistent
                # with feature_selection(); the search object itself has no
                # fitted importances/pipeline steps.
                fitted = selector.estimator_.best_estimator_
            else:
                selector = select_from_model(X, y)
                fitted = selector.estimator_
            fs.feature_importances = label_feature_importances(fitted, X.columns)
            if '_shap' in method_name:
                # NOTE(review): assumes the fitted estimator is a pipeline
                # whose classifier step is named 'c' — confirm upstream.
                fs.shap_values = get_shap_values(model=fitted.named_steps.c,
                                                 X=X,
                                                 X_train=X)
                # Removed dead local: parse_shap_values(fs.shap_values) was
                # computed and immediately discarded.
        elif method_name == 'fscore':
            selector = select_percentile(X, y, percentile=10)
        elif method_name == 'relieff':
            selector = select_relieff(X, y, percentile=10)
        elif method_name == 'multisurf':
            selector = select_multisurf(X, y, percentile=10)
        else:
            raise NotFoundException(
                "Cannot find feature selection method by {}".format(fs.method))
        fs.end_at = get_timestamp()  # Log ending timestamp

        # Update search request with results
        fs.features = label_support(selector.get_support(), X.columns)
        if not kwargs.get('save'):
            return fs
        return self.dataset_service.append_feature_selection(ds, fs)