def create_model_test(self, *, model: Model, split=0.7, step=None, task_key=None, window=None, **kwargs):
    service = DatasetService()
    ds = service.get_dataset(model.dataset, model.symbol)
    splits = DatasetService.get_train_test_split_indices(ds, split)
    parameters = kwargs.get('parameters')
    features = kwargs.get('features')
    # 'parameters' may be the literal string 'latest', meaning: reuse the most
    # recent parameter search result stored on the model
    if isinstance(parameters, str) and parameters == 'latest':
        if model.parameters:
            parameters = model.parameters[-1].parameters
        else:
            parameters = None
    # 'features' may be the name of a feature selection method whose stored
    # result provides the feature list
    if isinstance(features, str):
        fs = DatasetService.get_feature_selection(ds=ds, method=features, target=model.target)
        if fs:
            features = fs.features
        else:
            features = None
    result = ModelTest(
        window=window or {'days': 30},
        step=step or ds.interval,
        parameters=parameters or {},
        features=features or [],
        test_interval=splits['test'],
        task_key=task_key or str(uuid4())
    )
    return result
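# Usage sketch for create_model_test. Both `services` (an instance of the
# enclosing service class) and the loaded `model` are assumptions for
# illustration; neither is defined in this module.
def example_create_model_test(services, model: Model) -> ModelTest:
    # Reuse the latest stored parameter search and the features selected by the
    # 'importances' method; both strings are resolved by create_model_test itself.
    return services.create_model_test(
        model=model,
        split=0.7,
        window={'days': 30},
        parameters='latest',
        features='importances',
    )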
def create_features_search(self, *, symbol: str, dataset: str, target: str, split: float, method: str, task_key: str = None) -> ModelFeatures:
    ds = self.dataset_service.get_dataset(dataset, symbol)
    splits = DatasetService.get_train_test_split_indices(ds, split)
    result = ModelFeatures(
        dataset=dataset,
        target=target,
        symbol=symbol,
        search_interval=splits['train'],
        feature_selection_method=method,
        task_key=task_key or str(uuid4())
    )
    return result
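# Usage sketch for create_features_search: the request is pinned to the training
# slice of the split so the test slice cannot leak into feature selection.
# `services` and the literal argument values are hypothetical.
def example_create_features_search(services) -> ModelFeatures:
    return services.create_features_search(
        symbol='BTCUSD',   # hypothetical symbol
        dataset='ohlcv',   # hypothetical dataset name
        target='class',
        split=0.7,
        method='importances',
    )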
def create_parameters_search(self, model: Model, split: float, **kwargs) -> ModelParameters:
    ds = self.dataset_service.get_dataset(model.dataset, model.symbol)
    splits = DatasetService.get_train_test_split_indices(ds, split)
    # If 'features' is provided, it is interpreted as the name of a feature
    # selection method whose stored result provides the feature list
    features = kwargs.get('features')
    if features:
        target = kwargs.get('target', 'class')
        mf = DatasetService.get_feature_selection(ds=ds, method=features, target=target)
        if not mf:
            raise MessageException(
                f"Feature selection not found for {model.dataset}.{model.symbol} -> {target}!"
            )
        features = mf.features
    # Determine K for K-fold cross validation based on the dataset's sample count.
    # Each fold is split 80% train / 20% test, and the lowest training window for
    # accurate results is 30 samples, so each fold needs at least
    # 30 / 0.8 = 37.5 ≈ 40 samples.
    min_samples_per_fold = 40
    k = 5
    # If samples per fold with 5-fold CV are too low, fall back to 3 folds
    if ds.count / k < min_samples_per_fold:
        k = 3
    # If samples are still too low, give up (unless 'permissive' is set)
    if ds.count / k < min_samples_per_fold and not kwargs.get("permissive"):
        raise ValueError("Not enough samples to perform cross validation!")
    result = ModelParameters(
        cv_interval=splits['train'],
        cv_splits=k,
        task_key=kwargs.get('task_key', str(uuid4())),
        features=features or None
    )
    return result
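# The fold-sizing rule used above, extracted into a standalone sketch so the
# arithmetic is easy to verify in isolation. The helper name is hypothetical;
# the thresholds mirror the method's comments (30 / 0.8 = 37.5 ≈ 40).
def choose_cv_splits(sample_count: int, min_samples_per_fold: int = 40, permissive: bool = False) -> int:
    k = 5
    if sample_count / k < min_samples_per_fold:
        k = 3  # fall back to 3-fold CV on small datasets
    if sample_count / k < min_samples_per_fold and not permissive:
        raise ValueError("Not enough samples to perform cross validation!")
    return k

# choose_cv_splits(250) -> 5, choose_cv_splits(150) -> 3,
# choose_cv_splits(100) raises ValueError unless permissive=True.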
def grid_search_new(self, symbol: str, dataset: str, target: str, pipeline: str, split: float, feature_selection_method: str, **kwargs):
    # Check if a model exists and already holds a grid search result
    existing_model = self.model_service.get_model(pipeline=pipeline, dataset=dataset, target=target, symbol=symbol)
    if existing_model:
        mp_exists = ModelService.get_model_parameters(existing_model, method='gridsearch')
        if mp_exists:
            if kwargs.get('replace'):
                self.model_service.remove_parameters(model=existing_model, method='gridsearch')
            else:
                if kwargs.get('save'):
                    raise MessageException(
                        f"Grid search already performed for {pipeline}({dataset}.{symbol}) -> {target}"
                    )
    # Retrieve the dataset to use
    ds = self.dataset_service.get_dataset(dataset, symbol)
    # Determine cv_splits=K for K-fold cross validation based on the dataset's sample
    # count. Each fold is split 80% train / 20% test, and the lowest training window
    # for accurate results is 30 samples, so each fold needs at least
    # 30 / 0.8 = 37.5 ≈ 40 samples.
    min_samples_per_fold = 40
    cv_splits = 5
    # If samples per fold with 5-fold CV are too low, fall back to 3 folds
    if ds.count / cv_splits < min_samples_per_fold:
        cv_splits = 3
    # If samples are still too low, give up (unless 'permissive' is set)
    if ds.count / cv_splits < min_samples_per_fold and not kwargs.get("permissive"):
        raise ValueError("Not enough samples to perform cross validation!")
    # Determine split indices based on the dataset
    splits = DatasetService.get_train_test_split_indices(ds, split)
    cv_interval = splits['train']
    # Load dataset features, applying the specified feature selection method
    X = self.dataset_service.get_dataset_features(
        ds=ds,
        begin=cv_interval['begin'],
        end=cv_interval['end'],
        method=feature_selection_method,
        target=target
    )
    y = self.dataset_service.get_target(
        name=target,
        symbol=symbol,
        begin=cv_interval['begin'],
        end=cv_interval['end'],
    )
    # Check the class balance of the training data: with fewer than 2 classes the
    # model would be very unstable (or impossible to fit for k-NN based algorithms)
    unique, counts = np.unique(y, return_counts=True)
    if len(unique) < 2:
        logging.error("[{}-{}-{}-{}] Training data contains less than 2 classes: {}".format(
            symbol, dataset, target, pipeline, unique))
        raise MessageException("Training data contains less than 2 classes: {}".format(unique))
    logging.info("Dataset loaded: X {} y {} (unique: {})".format(X.shape, y.shape, unique))
    # Load the pipeline's estimator and parameter grid
    pipeline_module = get_pipeline(pipeline)
    # Perform the search
    gscv = GridSearchCV(
        estimator=pipeline_module.estimator,
        param_grid=kwargs.get('parameter_grid', pipeline_module.PARAMETER_GRID),
        # cv=BlockingTimeSeriesSplit(n_splits=cv_splits),
        cv=StratifiedKFold(n_splits=cv_splits),
        scoring=get_precision_scorer(),
        verbose=kwargs.get("verbose", 0),
        n_jobs=kwargs.get("n_jobs", None),
        refit=False
    )
    mp = ModelParameters(
        cv_interval=splits['train'],
        cv_splits=cv_splits,
        task_key=kwargs.get('task_key', str(uuid4())),
        features=[c for c in X.columns],
        parameter_search_method='gridsearch'
    )
    mp.start_at = get_timestamp()
    gscv.fit(X, y)
    mp.end_at = get_timestamp()
    # Collect results
    results_df = pd.DataFrame(gscv.cv_results_)
    mp.parameters = gscv.best_params_
    mp.cv_results = results_df.loc[:, results_df.columns != 'params'].to_dict('records')
    tag = "{}-{}-{}-{}-{}".format(symbol, dataset, target, pipeline, dict_hash(mp.parameters))
    mp.result_file = 'cv_results-{}.csv'.format(tag)
    # Create a new model holding this parameter search result
    model = Model(
        pipeline=pipeline,
        dataset=dataset,
        target=target,
        symbol=symbol,
        features=feature_selection_method
    )
    model.parameters.append(mp)
    self.model_repo.create(model)
    # Save grid search results to storage
    if kwargs.get('save', True):
        storage_service.upload_json_obj(mp.parameters, 'grid-search-results', 'parameters-{}.json'.format(tag))
        storage_service.save_df(results_df, 'grid-search-results', mp.result_file)
    return mp
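# Usage sketch for grid_search_new. `services` and the literal values are
# hypothetical; 'mlp' stands for any pipeline name that get_pipeline() resolves.
def example_grid_search(services) -> dict:
    mp = services.grid_search_new(
        symbol='BTCUSD',                        # hypothetical symbol
        dataset='ohlcv',                        # hypothetical dataset name
        target='class',
        pipeline='mlp',                         # hypothetical pipeline name
        split=0.7,
        feature_selection_method='importances',
        n_jobs=4,
        save=True,                              # upload results to storage
    )
    return mp.parameters  # best parameter combination found by GridSearchCV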
def test_model_new(self, *, pipeline: str, dataset: str, symbol: str, target: str, split=0.7, step=None, task_key=None, window=None, **kwargs):
    test_window = window or {'days': 90}
    model = self.get_model(pipeline=pipeline, dataset=dataset, symbol=symbol, target=target)
    ds = self.dataset_service.get_dataset(dataset, symbol)
    splits = DatasetService.get_train_test_split_indices(ds, split)
    test_interval = splits['test']
    test_step = step or ds.interval
    # Parse model parameters: if 'parameters' is a string, it names the parameter
    # search method whose stored result should be reused
    parameters = kwargs.get('parameters')
    features = kwargs.get('features')
    mp = ModelService.get_model_parameters(m=model, method=parameters)
    if not mp:
        logging.warning(
            f"Parameter search with method {parameters} does not exist in model"
            f" {model.pipeline}({model.dataset}.{model.symbol}) -> {model.target}"
        )
    # Get training data, including the first training window before the test interval
    begin = sub_interval(timestamp=test_interval["begin"], interval=test_window)
    end = add_interval(timestamp=test_interval["end"], interval=test_step)
    if from_timestamp(ds.valid_index_min).timestamp() > from_timestamp(begin).timestamp():
        raise MessageException(
            f"Not enough data for training with window {test_window}!"
            f" {model.pipeline}({model.dataset}.{model.symbol}) -> {model.target}"
        )
    test_X, test_y = self.dataset_service.get_x_y(dataset, symbol, target, features, begin, end)
    # Slice the testing interval into sliding windows
    windows = [(b, e) for b, e in timestamp_windows(begin, end, test_window, test_step)]
    # Fit one estimator per window and make predictions
    storage_service.create_bucket(bucket='fit-estimators')
    _n_jobs = int(kwargs.get('n_jobs', cpu_count() / 2))
    logging.info(f"Fitting {len(windows)} estimators with {_n_jobs} threads..")
    fit_estimators = Parallel(n_jobs=_n_jobs)(
        delayed(fit_estimator_new)(
            model=model,
            mp=mp,
            features=features,
            day=e,
            window=test_window,
            X=test_X,
            y=test_y,
            b=b,
            e=e,
            force=not kwargs.get('save')
        ) for b, e in tqdm(windows))
    logging.info(f"Saving {len(windows)} fit estimators with {_n_jobs} threads..")
    estimator_names = Parallel(n_jobs=_n_jobs)(
        delayed(save_estimator)(estimator=est) for est in tqdm(fit_estimators))
    logging.info(f"Predicting with {len(windows)} estimators with {_n_jobs} threads..")
    prediction_results = Parallel(n_jobs=_n_jobs)(
        delayed(predict_estimator_day)(
            estimator=est,
            day=est.day,
            X=test_X[est.begin:est.end],
            y=test_y[est.begin:est.end]
        ) for est in tqdm(fit_estimators))
    results = [r for r in prediction_results if r is not None]
    df = pd.DataFrame(results)
    if df.empty:
        raise MessageException("TestWindows: Empty result dataframe!")
    classification_records = [r for r in df.to_dict(orient='records')]
    # Build the test instance with its parameters and per-day classification results
    mt = ModelTest(
        window=test_window,
        step=test_step,
        parameters=mp.parameters,
        features=[c for c in test_X.columns],
        test_interval=splits['test'],
        task_key=task_key or str(uuid4()),
        classification_results=classification_records,
    )
    # Populate classification report fields
    clf_report = flattened_classification_report_imbalanced(df.label, df.predicted)
    roc_report = roc_auc_report(df.label, df.predicted, df[[c for c in df.columns if '_proba_' in c]])
    clf_report.update(roc_report)
    mt.classification_report = clf_report
    # Save the test into the model if requested
    if kwargs.get('save'):
        return self.model_repo.append_test(model.id, mt)
    return mt
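# Usage sketch for test_model_new: run a 90-day sliding-window test that reuses
# the stored grid-search parameters, without persisting the result. `services`
# and the literal values are hypothetical.
def example_test_model(services) -> dict:
    mt = services.test_model_new(
        pipeline='mlp',            # hypothetical pipeline name
        dataset='ohlcv',           # hypothetical dataset name
        symbol='BTCUSD',           # hypothetical symbol
        target='class',
        window={'days': 90},
        parameters='gridsearch',   # reuse the parameters stored on the model
        save=False,                # return the ModelTest without appending it
    )
    return mt.classification_report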
def feature_selection_new(self, *, symbol: str, dataset: str, target: str, split: float, method: str, **kwargs) -> ModelFeatures:
    ds = self.dataset_service.get_dataset(dataset, symbol)
    fs_exists = DatasetService.has_feature_selection(ds=ds, method=method, target=target)
    if fs_exists:
        if kwargs.get('replace'):
            self.dataset_service.remove_feature_selection(ds=ds, method=method, target=target)
        else:
            if kwargs.get('save'):
                raise MessageException(
                    f"Feature selection with method '{method}' already performed for '{dataset}.{symbol}' and target '{target}'"
                )
    splits = DatasetService.get_train_test_split_indices(ds, split)
    fs = FeatureSelection(
        target=target,
        method=method,
        search_interval=splits['train'],
        task_key=kwargs.get('task_key', str(uuid4()))
    )
    # Load the dataset's features and target over the search interval
    X = self.dataset_service.get_dataset_features(
        ds=ds, begin=fs.search_interval.begin, end=fs.search_interval.end)
    y = self.dataset_service.get_dataset_target(
        name=fs.target, ds=ds, begin=fs.search_interval.begin, end=fs.search_interval.end)
    unique, counts = np.unique(y, return_counts=True)
    if len(unique) < 2:
        logging.error("[{}-{}-{}] Training data contains less than 2 classes: {}".format(
            symbol, dataset, target, unique))
        raise MessageException("Training data contains less than 2 classes: {}".format(unique))
    # Perform the search
    fs.start_at = get_timestamp()  # Log starting timestamp
    if not fs.method or 'importances' in fs.method:
        # Guard the substring checks so a None/empty method defaults to select_from_model
        if fs.method and '_cv' in fs.method:
            selector = select_from_model_cv(X, y)
        else:
            selector = select_from_model(X, y)
        fs.feature_importances = label_feature_importances(selector.estimator_, X.columns)
        if fs.method and '_shap' in fs.method:
            fs.shap_values = get_shap_values(model=selector.estimator_.named_steps.c, X=X, X_train=X)
            shap_values = parse_shap_values(fs.shap_values)
    elif fs.method == 'fscore':
        selector = select_percentile(X, y, percentile=10)
    elif fs.method == 'relieff':
        selector = select_relieff(X, y, percentile=10)
    elif fs.method == 'multisurf':
        selector = select_multisurf(X, y, percentile=10)
    else:
        raise NotFoundException("Cannot find feature selection method by {}".format(fs.method))
    fs.end_at = get_timestamp()  # Log ending timestamp
    # Update the search request with results
    fs.features = label_support(selector.get_support(), X.columns)
    if not kwargs.get('save'):
        return fs
    return self.dataset_service.append_feature_selection(ds, fs)
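# Usage sketch for feature_selection_new, returning the selected feature names
# without persisting the search. `services` and the literal values are
# hypothetical; suffixes such as '_cv' and '_shap' in the method string toggle
# the cross-validated selector and SHAP computation in the branches above.
def example_feature_selection(services) -> list:
    fs = services.feature_selection_new(
        symbol='BTCUSD',       # hypothetical symbol
        dataset='ohlcv',       # hypothetical dataset name
        target='class',
        split=0.7,
        method='importances',
        save=False,            # return the FeatureSelection without saving it
    )
    return fs.features  # features whose support flag was set by the selector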