def _impl(self, *method_args, **method_kwargs):
    """Run ``method`` on ``self`` and then fill in missing configuration defaults.

    ``method`` is a free variable here (presumably supplied by an enclosing
    decorator - confirm against the surrounding code). After it returns,
    every key of ``conf.DEFAULT_CONFIG_VALUES`` whose attribute on ``self``
    is missing or falsy is set to its default value.

    Returns:
        Whatever ``method`` returned.
    """
    outcome = method(self, *method_args, **method_kwargs)
    defaults = conf.DEFAULT_CONFIG_VALUES
    for key, value in defaults.items():
        if getattr(self, key, None):
            # A truthy value is already present; leave it untouched.
            continue
        setattr(self, key, value)
        l.debug(f'Using default value {value} of {key}')
    return outcome
def init_with_models(self, model_objects: List[ModelCollection], description=None):
    """Initialise this ensemble from a list of stored model objects.

    All models must share the same ``code`` and the same
    ``config['DEAL_TYPE']``. Each one is loaded through
    ``ModelInterface.load_from_mongo_object`` and a new ``ModelCollection``
    record describing the ensemble is attached to ``self.obj``.

    Args:
        model_objects: Stored models that form the ensemble.
        description: Optional text stored on the new record.

    Returns:
        ``self``.

    Raises:
        WrongParameter: If ``model_objects`` is empty, or the models differ
            in code or deal type.
    """
    if len(model_objects) == 0:
        raise WrongParameter(
            'At least one model is required to create an ensemble')

    # The first model is the reference every other model is compared with.
    code = model_objects[0].code
    deal_type = model_objects[0].config['DEAL_TYPE']
    for model_obj in model_objects:
        if code != model_obj.code:
            raise WrongParameter(
                'All the models in ensemble shall have the same code')
        # Compare against the model being inspected, not the first one.
        if deal_type != model_obj.config['DEAL_TYPE']:
            raise WrongParameter(
                'All the models in ensemble shall have the same deal type')

    self.models = []
    for model_obj in model_objects:
        self.models.append(ModelInterface.load_from_mongo_object(model_obj))

    self.obj = ModelCollection()
    self.obj.model_id = f'{code}.{self._generate_model_id()}'
    self.obj.status = 'READY'
    self.obj.model_type = ModelTypeMatcher.get_type_name(self)
    self.obj.created_at = datetime.now(timezone.utc)
    self.obj.code = code
    self.obj.description = description
    self.obj.model_ensemble = model_objects
    l.debug(
        f'Ensemble model is created with id {self.obj.model_id}. It has {len(self.models)} models in ensemble'
    )
    return self
def _preprocess(self):
    """Turn the raw source frame into candles enriched with extra features.

    Works on a copy of ``self.source.df``: the index is shifted, the data is
    resampled with ``self.to_candles``, optionally trimmed to the last
    ``self.last_n_steps`` rows, and calendar, TA and days-to-expiration
    columns are added. The alignment shift is removed from the index before
    the frame is returned.

    Returns:
        The preprocessed frame.
    """
    local_conf = self.config
    l.debug(f'Start preprocessing of {self.source.code}')
    frame = self.source.df.copy()

    # Alignment shift derived from the minute of the latest timestamp; per
    # the original note it allows a prediction every minute at inference.
    timeshift = -1 * frame.index.max().minute % local_conf.PREPROCESS_PERIOD_MINUTES
    alignment_delta = pd.Timedelta(minutes=timeshift)

    # ``self.shift`` is the extra offset used to cover all configurations;
    # the original note says not to use it during inference.
    frame.index = frame.index + pd.Timedelta(minutes=self.shift) + alignment_delta

    candles = self.to_candles(frame, f'{local_conf.PREPROCESS_PERIOD_MINUTES}T')
    if self.last_n_steps > 0:
        candles = candles[-self.last_n_steps:]

    with_calendar = candles.assign(
        weekday=candles.index.weekday,
        month=candles.index.month,
        hour=candles.index.hour,
    )
    with_ta = self.add_ta_features(with_calendar)
    enriched = self.add_days_to_expiration(with_ta, self.source.code)

    # Undo the alignment shift so timestamps match the source again.
    enriched.index = enriched.index - pd.Timedelta(minutes=timeshift)
    return enriched
def _criterion(self, result):
    """Tell the caller whether a ``stop`` marker file requests a halt.

    Looks for a file named ``stop`` in the current working directory. When
    it exists it is deleted, so the request is consumed, and ``True`` is
    returned. ``result`` is not inspected.

    Returns:
        ``True`` if the marker file was found and removed, else ``False``.
    """
    file_name = 'stop'
    if not os.path.isfile(file_name):
        return False
    os.remove(file_name)
    l.debug('Stop file is found. Stopping optimization task')
    return True
def init_from_point(self, point_object=None, point_object_id=None):
    """Initialise a fresh model record from an experiment point.

    The point is taken from ``point_object`` when given, otherwise it is
    looked up by ``point_object_id``. A new ``ModelCollection`` is attached
    to ``self.obj`` and filled from the point and its experiment, and the
    experiment configuration is loaded into ``self.exp_config``.

    Raises:
        BadLogic: If this instance already has ``self.obj`` set.
        ValueError: If neither argument is provided.
    """
    if self.obj is not None:
        raise BadLogic('Cannot call init_from_point on a used model instance')

    if point_object is not None:
        p = point_object
    elif point_object_id is not None:
        p = Point.objects_safe(pk=point_object_id).first()
    else:
        raise ValueError(
            'At least one of point_object or point_object_id must be provided'
        )

    # Fall back to the first two characters of the first datasource key
    # when the experiment has no explicit code.
    experiment = p.experiment
    code = experiment.code or list(experiment.data['DATASOURCES'].keys())[0][:2]

    record = self.obj = ModelCollection()
    record.model_id = f'{code}.{self._generate_model_id()}'
    record.status = 'READY'
    record.model_type = ModelTypeMatcher.get_type_name(self)
    record.point = p
    record.experiment = p.experiment.id
    record.step = p.step
    self.exp_config = ExperimentConfiguration().load_by_point_object(p)
    record.config = self.exp_config._as_dict
    record.created_at = datetime.now(timezone.utc)
    record.code = code
    l.debug(f'model is loaded from the point {p.id}')
def load_from_json_file(self, fpath):
    """Load configuration values from a JSON file onto this instance.

    Every top-level key of the JSON object becomes an attribute of ``self``.
    The keys are added to ``self.loaded_keys`` and
    ``self.adjusted_parameters`` is reset to an empty list.

    Args:
        fpath: Path of the JSON file to read.
    """
    self._update_revision()
    l.debug(f'reading config from {fpath}')
    # Context manager so the file handle is closed even if parsing fails.
    with open(fpath, 'r') as config_file:
        j = json.load(config_file)
    for key, val in j.items():
        setattr(self, key, val)
    # ``union`` returns a new set; the result must be stored, otherwise the
    # loaded keys are silently dropped.
    self.loaded_keys = self.loaded_keys.union(j.keys())
    self.adjusted_parameters = []
def upload_binary(self, binary_data, key):
    """Store ``binary_data`` in the configured bucket under ``key``.

    The key is converted with ``str`` before being sent. The HTTP status
    code from the response metadata is logged; nothing is returned.
    """
    response = self.client.put_object(
        Bucket=self.bucket,
        Body=binary_data,
        Key=str(key),
    )
    metadata = response['ResponseMetadata']
    status_code = metadata['HTTPStatusCode']
    l.debug(
        f'file uploaded to s3 with key {key}, response status is {status_code}'
    )
def do_predictions(self):
    """Refresh the data source and compute a trade signal for each model.

    The source is updated once. Each loaded model then refreshes its
    attached data without updating the source again, and the first element
    of ``predict_latest()`` is stored as ``model.trade_signal``.
    """
    l.debug(f'{self.code}: Doing predictions')
    self.source.update()
    for model in self.loaded_models:
        model.update_attached_data(update_source=False)
        latest_prediction = model.predict_latest()
        model.trade_signal = latest_prediction[0]
        if not model.trade_signal:
            continue
        l.debug(f'{self.code}: got deal recomendation for the model {model._id}/{model.config.DEAL_TYPE}')
def load_from_experiment_object(self, experiment_object):
    """Populate this configuration from a stored experiment object.

    Loads, in order, the experiment's ``space``, its ``data`` dict and a
    ``MODEL_SPACE`` entry taken from ``experiment_object.model_space``.
    """
    self._update_revision()
    self._load_from_space_dict(experiment_object.space)
    self._load_from_dict(experiment_object.data)
    model_space_entry = {'MODEL_SPACE': experiment_object.model_space}
    self._load_from_dict(model_space_entry)
    l.debug(
        f'Configuration for experiment {experiment_object.id} is successfully loaded from DB'
    )
def load(self):
    """Read the gzipped CSV for ``self.code`` via the S3 helper into ``self._df``.

    The file path comes from ``self.s3.download_if_not_cached_and_get_path``.
    The ``full_date`` column becomes the index and is converted to
    datetimes with the ``%Y-%m-%d %H:%M:%S`` format.

    Returns:
        ``self``.
    """
    l.debug(f'Loading S3 datasource for code {self.code}')
    cached_path = self.s3.download_if_not_cached_and_get_path(
        key=f"data/{self.code}.csv.gz")
    frame = pd.read_csv(cached_path, index_col='full_date', parse_dates=True)
    frame.index = pd.to_datetime(frame.index, format='%Y-%m-%d %H:%M:%S')
    self._df = frame
    return self
def load(self):
    """Read the CSV at ``self._filepath`` into ``self._df``.

    The ``full_date`` column becomes the index and is converted to
    datetimes with the ``%Y-%m-%d %H:%M:%S`` format.

    Returns:
        ``self``.
    """
    l.debug(f'reading {self._filepath}')
    frame = pd.read_csv(self._filepath,
                        index_col='full_date',
                        parse_dates=True)
    frame.index = pd.to_datetime(frame.index, format='%Y-%m-%d %H:%M:%S')
    self._df = frame
    return self
def train(self, datasets, tags, model_params=None, model_param_distributions=None, random_search_iterations=8):
    """Fit a random forest classifier on the datasets matching ``tags``.

    Provide one and only one of ``model_params`` and
    ``model_param_distributions``. The fitted estimator is stored in
    ``self.clf`` and a new model id is stored in ``self._id``.

    Args:
        datasets: Candidate datasets; filtered with
            ``filter_datasets_with_tags``.
        tags: Tag selector passed to ``filter_datasets_with_tags``.
        model_params: Dict with params for a classifier.
        model_param_distributions: Dict with search ranges for the params
            of the model; triggers a randomized search.
        random_search_iterations: Number of sampled settings for the
            randomized search.

    Raises:
        NoTrainingData: If no dataset matches ``tags`` or the matching
            datasets contain no rows.
        ValueError: If both or neither of ``model_params`` and
            ``model_param_distributions`` are usable.
    """
    relevant_datasets = filter_datasets_with_tags(datasets, tags)
    # Without this guard an empty selection fails with an IndexError below
    # instead of the domain error that callers handle.
    if len(relevant_datasets) == 0:
        raise NoTrainingData

    # ==== make id section
    self._id = f'{relevant_datasets[0].source.code[:2]}.{self._generate_model_id()}'

    X = np.concatenate([d.X for d in relevant_datasets])
    Y = np.concatenate([d.Y for d in relevant_datasets])
    if X.shape[0] == 0:
        raise NoTrainingData

    if model_param_distributions is not None and model_params is not None:
        raise ValueError(
            'Only one of model_params and model_param_distributions must be provided'
        )

    if model_params:
        l.debug('Init Random forest classifier')
        clf = RandomForestClassifier(**model_params)
    elif model_param_distributions:
        l.debug(
            f'Init RandomSearch of Random forest config. Space is {model_param_distributions}'
        )
        # NOTE(review): ``iid`` is not accepted by recent scikit-learn
        # releases - confirm against the pinned version before upgrading.
        clf = RandomizedSearchCV(
            RandomForestClassifier(),
            param_distributions=model_param_distributions,
            cv=TimeSeriesSplit(n_splits=2),
            iid=False,
            n_jobs=conf.SK_JOBS,
            pre_dispatch=1,
            n_iter=random_search_iterations,
            random_state=143,
            verbose=1)
    else:
        raise ValueError(
            'At least one of model_params and model_param_distributions must be provided'
        )

    l.debug('Start fit of classifier')
    start = time.time()
    clf.fit(X, Y)
    l.debug(f'Training a classifier took {time.time() - start}.')
    if model_param_distributions:
        l.debug(f'Best params are {clf.best_params_}')
    self.clf = clf
def save_to_cloud(self):
    """Persist a locally created model record.

    Saves ``self.obj`` through ``save_safe``.

    Returns:
        The ``model_id`` of the saved record.
    """
    record = self.obj
    record.save_safe()
    l.debug(f'Model with id={self.obj.model_id } is saved')
    return record.model_id
def adjust_parameters(self, **kwargs):
    """Overwrite existing configuration attributes with the given values.

    Only names that already exist as attributes are updated; each updated
    name is appended to ``self.adjusted_parameters``. Unknown names are
    logged and ignored. Real numbers with no fractional part (for example
    ``3.0``) are stored as ``int``.

    Args:
        **kwargs: Attribute names mapped to their new values.
    """
    import numbers

    self._update_revision()
    for k, v in kwargs.items():
        if not hasattr(self, k):
            l.debug(f"{k} is not present among attributes, ignoring it")
            continue
        # Only real numbers are coerced. Applying ``%`` to other values
        # (strings, None, ...) either raises or performs string formatting.
        if isinstance(v, numbers.Real) and v % 1 == 0:
            v = int(v)
        l.debug(f"{k} is updated {getattr(self, k)} --> {v}")
        setattr(self, k, v)
        self.adjusted_parameters.append(k)
def drop_from_cloud(cls, model_id: str = None, object_id=None) -> bool:
    """Look up a stored model and delegate its removal to its model class.

    The record is searched by ``model_id`` when given, otherwise by
    ``object_id``. The class registered for the record's ``model_type`` is
    then asked to drop it.

    Returns:
        ``True`` once the delegated call has returned.

    Raises:
        RecordNotFound: If no record matches or no identifier is given.
    """
    obj = None
    if model_id is not None:
        l.debug(f'Loading model with model_id={model_id}')
        obj = ModelCollection.objects_safe(model_id=model_id).first()
    elif object_id is not None:
        l.debug(f'Loading model with object_id={object_id}')
        obj = ModelCollection.objects_safe(pk=object_id).first()

    if not obj:
        raise RecordNotFound('Cannot find model')

    model_class = ModelTypeMatcher.get_class(obj.model_type)
    model_class.drop_from_cloud(obj)
    return True
def predict(code):
    """Serve a prediction request for ``code``.

    Only ``GET`` requests are handled; for any other method nothing is
    returned. When no inference pack exists for ``code`` a JSON message
    with an empty ``advices`` list is returned, otherwise the pack runs its
    predictions and its ``json_response`` is returned.
    """
    if request.method != 'GET':
        return None

    l.debug(f'got prediction request on {code}')
    if code not in inf_packs:
        payload = {
            'code': 200,
            'message': f"No models for the code {code}",
            'advices': [],
        }
        return json.dumps(payload)

    inf_pack: InferencePack = inf_packs[code]
    inf_pack.do_predictions()
    return inf_pack.json_response
def __call__(self, result):
    """Store the latest result and react when the last value is the best.

    Parameters
    ----------
    * `result` [`OptimizeResult`, scipy object]:
        The optimization as a OptimizeResult object. Its ``func_vals`` are
        read; when the last value equals their minimum,
        ``self.do_if_best_result`` is called with that minimum.
    """
    self.result = result
    func_vals = result.func_vals
    best_value = np.min(func_vals)
    if func_vals[-1] != best_value:
        return
    l.debug(f'Best result {best_value} is achieved')
    self.do_if_best_result(best_value)
def _rebalance(self):
    """Stage #2.1: rebalance.

    Downsamples the rows whose ``MAIN_LABEL`` is falsy so that there are at
    most ``LABEL_DEAL_NO_DEAL`` of them per row whose label is truthy.

    Returns:
        ``self._data`` unchanged when one of the classes is empty or the
        ratio is already within the limit; otherwise a new frame with all
        positive rows followed by the sampled negative rows.
    """
    cfg = self.config
    old_size = self._data.shape[0]
    is_positive = self._data[cfg.MAIN_LABEL].astype(bool)
    df_pos = self._data[is_positive]
    df_neg = self._data[~is_positive]

    if df_neg.shape[0] == 0:
        l.debug(
            f'{self.source.code}: {old_size}, No negative samples. No rebalancing will be done'
        )
        return self._data
    if df_pos.shape[0] == 0:
        # This branch is about missing positives; the message used to
        # repeat the "negative" text of the branch above.
        l.debug(
            f'{self.source.code}: {old_size}, No positive samples. No rebalancing will be done'
        )
        return self._data
    if df_neg.shape[0] <= cfg.LABEL_DEAL_NO_DEAL * df_pos.shape[0]:
        l.debug(
            f'{self.source.code}: {old_size}, n={df_neg.shape[0]}, p={df_pos.shape[0]}. No rebalancing is necessary'
        )
        return self._data

    # ``sample`` needs an integer count; the configured ratio may be a float.
    number_of_negative_samples = int(cfg.LABEL_DEAL_NO_DEAL * df_pos.shape[0])
    df_neg = df_neg.sample(n=number_of_negative_samples)
    df = pd.concat([df_pos, df_neg])
    l.debug(
        f'{self.source.code}: {old_size}-->{df.shape[0]}, n={df_neg.shape[0]}, p={df_pos.shape[0]}'
    )
    return df
def train_preloaded(self, datasets=None):
    """Train the model with the classifier parameters stored on its point.

    Args:
        datasets: Datasets to train on. When ``None`` they are built from
            ``self.exp_config`` with ``get_datasets_from_exp_config``.

    Side effects:
        Sets ``self.obj.status`` to ``'TRAINED'`` and ``self.was_trained``
        to ``True``.
    """
    if datasets is None:
        datasets_local = get_datasets_from_exp_config(self.exp_config)
    else:
        datasets_local = datasets
    self.train(datasets_local,
               'ALL',
               model_params=self.obj.point.clf_params)
    self.obj.status = 'TRAINED'
    l.debug('Model is trained with preloaded setup')
    self.was_trained = True
    # The S3 key and ``s3:`` string that used to be computed here were never
    # used and have been removed.
def remove_file(self, key, also_from_cache=True):
    """Delete ``key`` from the bucket and, optionally, from the local cache.

    Args:
        key: Object key; converted with ``str`` for both the bucket call
            and the cache path.
        also_from_cache: When true, also remove the cached copy under
            ``conf.AWS_LOCAL_CACHE_FOLDER`` if it exists.
    """
    key_str = str(key)
    # The response is ignored on purpose: the delete is best effort.
    self.client.delete_object(Bucket=self.bucket, Key=key_str)
    if not also_from_cache:
        return
    # Use the same string form as the bucket call, so that non-str keys do
    # not break the path concatenation.
    filename = conf.AWS_LOCAL_CACHE_FOLDER + key_str
    if Path(filename).is_file():  # only when a cached copy exists
        os.remove(filename)
        l.debug(f'removed file {filename}')
def save_to_cloud(self):
    """Persist a locally created model, uploading the classifier if trained.

    When ``self.was_trained`` is set, the pickled ``self.clf`` is uploaded
    under ``conf.AWS_MODELS_FOLDER`` and the ``s3:`` location is recorded
    in ``self.obj.model_file``. The record is then saved.

    Returns:
        The ``model_id`` of the saved record.

    Raises:
        WrongParameter: If the model is in read-only mode.
    """
    if self.read_only:
        raise WrongParameter('Cannot save model in read only mode')

    if self.was_trained:
        file_key = conf.AWS_MODELS_FOLDER + self.obj.model_id + '.pkl'
        serialized_clf = pickle.dumps(self.clf)
        self.s3.upload_binary(serialized_clf, file_key)
        self.obj.model_file = f's3:{file_key}'

    self.obj.save_safe()
    l.debug(f'Model with id={self.obj.model_id } is saved')
    return self.obj.model_id
def pick_one_experiment_and_run(experiment_id='5cdd609fd71f100c646bde55'):
    """Run the optimization for one stored experiment.

    The experiment is marked ``IN_PROCESS``, its search space is built,
    ``gp_minimize`` is run with a ``MongoSaver`` callback, and the
    experiment is finally marked ``DONE``.

    Args:
        experiment_id: Primary key of the experiment to run. The default is
            the id previously hard-coded here (tagged ``GD`` in the
            original comment; ``5cdba572d71f102dbd7b03c4`` was tagged
            ``RI``).
    """
    experiment = mongo.Experiment.objects(pk=experiment_id).first()
    if not experiment:
        l.debug("There are no experiments found")
        return

    experiment.status = 'IN_PROCESS'
    experiment.executor = os.uname().nodename
    experiment.exp_id = str(int(time.time()))
    experiment.started_at = datetime.now(timezone.utc)
    experiment.save()

    # Read search space
    search_space = make_search_space(experiment.space)

    # Init new experiment configuration instance
    ec = classes.ExperimentConfiguration()
    initial_space = experiment.space
    ec.load_from_experiment_object(experiment)

    # Init new experiment pipeline instance
    ep = classes.ExperimentPipeline(experiment_configuration=ec)

    @use_named_args(search_space)
    def execute_experiment(**params):
        # Reset to the initial space before applying this step's parameters.
        ec._load_from_space_dict(initial_space)
        ec.adjust_parameters(**params)
        ep.config = ec
        return ep.run()

    mongo_saver = callbacks.MongoSaver(ec,
                                       ep,
                                       experiment,
                                       search_space=search_space)
    # The return value is not used here; the callback receives each result.
    gp_minimize(execute_experiment,
                search_space,
                callback=[mongo_saver],
                verbose=True,
                **experiment.optimization_config.values)

    experiment.status = 'DONE'
    experiment.finished_at = datetime.now(timezone.utc)
    experiment.save()
def _load_self_from_mongo_object(self, obj: ModelCollection):
    """Attach a stored ensemble record and load each of its member models.

    Raises:
        BadLogic: If this instance already has ``self.obj`` set.
        MalformedModel: If ``obj`` has no ``model_ensemble`` field or it is
            empty.
    """
    if self.obj is not None:
        raise BadLogic('Cannot load model instance twice')
    self.obj = obj

    model_ensemble = getattr(obj, 'model_ensemble', None)
    if model_ensemble is None or len(obj.model_ensemble) == 0:
        raise MalformedModel('Model has to have "model_ensemble" field')
    if len(obj.model_ensemble) == 1:
        l.warning(
            f'Strange ensemble, that has only one model. Obj.id={obj.id}')

    self.models = []
    i = 0
    for i, model_obj in enumerate(model_ensemble, start=1):
        member = ModelInterface.load_from_mongo_object(model_obj)
        self.models.append(member)
    l.debug(
        f'Successfully loaded ensemble model, that has {i} members. obj.id={obj.id}'
    )
def pick_one_experiment_and_fine_tune():
    """Fine tune the first experiment found in status ``2TUNE``.

    The experiment is marked ``FINETUNING_IN_PROCESS``, given
    ``conf.DEFAULT_TUNING_SPACE`` when it has no tuning params, and passed
    to ``fine_tune_experiment``. The final status is ``DONE_AND_TUNED`` on
    success or ``TUNING_ERROR`` (with a status message) for the handled
    exceptions; the experiment is saved in every case.
    """
    experiment = mongo.Experiment.objects_safe(status='2TUNE').first()
    if experiment is None:
        l.debug('No experiments in status 2TUNE are found')
        return

    l.debug(
        f'Processing experiment {experiment.id}, name {experiment.name} ')
    experiment.status = 'FINETUNING_IN_PROCESS'
    if not experiment.model_tuning_params:
        experiment.model_tuning_params = conf.DEFAULT_TUNING_SPACE
    experiment.save_safe()

    try:
        fine_tune_experiment(experiment.id, **experiment.model_tuning_params)
    except NoTrainingData:
        experiment.status = 'TUNING_ERROR'
        experiment.status_message = ('Got NoTrainingData exception')
    except MissingConfigValues as e:
        # The exception must be bound: the message below reads ``e.args``.
        experiment.status = 'TUNING_ERROR'
        experiment.status_message = (f'Got MissingConfigValues: {e.args[0]}')
    except MalformedExperiment as e:
        l.error(e.args[0])
        experiment.status = 'TUNING_ERROR'
        experiment.status_message = (f'Got MalformedExperiment: {e.args[0]}')
    else:
        experiment.status = 'DONE_AND_TUNED'
        l.debug('Experiment is successfully tuned')
    finally:
        # Persist whichever status was set, including on unhandled errors.
        experiment.save_safe()
def to_candles(df, period):
    """Resample a frame with ``price`` and ``vol`` columns into candles.

    Args:
        df: Frame with ``price`` and ``vol`` columns; it is resampled with
            ``period``, so it needs an index that supports ``resample``.
        period: Resampling rule passed to ``resample``.

    Returns:
        A frame with ``open``/``high``/``low``/``close``, ``vol`` (summed
        volume), ``deals`` (row count), ``avg_deal``, ``price``
        (volume-weighted price, backfilled), ``candle_body``,
        ``upper_shadow``, ``lower_shadow`` and ``candle_body_3`` /
        ``candle_body_7`` (close minus the open from that many candles
        earlier).
    """
    l.debug(f'period={period}')
    ds_vol = df.vol.resample(period).sum().dropna().astype(int)
    ds_deals = df.vol.resample(period).count().dropna().astype(int)
    ds_price = (df.vol * df.price).resample(period).sum().dropna() / ds_vol
    # ``bfill`` replaces the deprecated ``fillna(method='backfill')``.
    ds_price = ds_price.bfill().astype(int)

    df_candles = df.price.resample(period).ohlc().dropna().astype(int).assign(
        vol=ds_vol,
        deals=ds_deals,
        avg_deal=ds_vol / ds_deals,
        price=ds_price)

    # Column arithmetic instead of a Python-level call per row. The casts
    # to float keep the dtype that the previous row-wise ``apply`` produced
    # on this mixed int/float frame.
    body_top = df_candles[['close', 'open']].max(axis=1)
    body_bottom = df_candles[['close', 'open']].min(axis=1)
    df_candles = df_candles.assign(
        candle_body=(df_candles.close - df_candles.open).astype(float),
        upper_shadow=(df_candles.high - body_top).astype(float),
        lower_shadow=(body_bottom - df_candles.low).astype(float))

    for candle_size in [3, 7]:
        df_candles[f'candle_body_{candle_size}'] = (
            df_candles.close - df_candles.open.shift(candle_size))
    return df_candles
def __init__(self, code):
    """Prepare the inference pack for ``code``.

    Models are looked up by status ``DEPLOYED`` and the first two
    characters of ``code``. When at least one exists, the DB data source
    is loaded and every model is loaded and has its data attached;
    otherwise ``self.loaded_models`` stays empty and no source is set.
    """
    self.loaded_models = []
    self.code = code
    l.debug(f'{code}: Preparing inference pack')

    deployed_filter = dict(status='DEPLOYED', code=self.code[:2])
    if not ModelCollection.objects(**deployed_filter).count() > 0:
        l.debug(f'{code}: no models found in status DEPLOYED')
        return

    # load data source
    l.debug(f'{code}: load data source')
    self.source = DBDataSource(code).load()

    # load models
    self.loaded_models = []
    for obj in ModelCollection.objects(**deployed_filter):
        model_instance = ModelInterface.load_from_mongo_object(obj)
        model_instance.attach_data(self.dataset_factory)
        self.loaded_models.append(model_instance)
    l.debug(f'{code}: found {len(self.loaded_models)} models in status DEPLOYED')
def download_if_not_cached_and_get_path(self, key):
    """Return the local cache path for ``key``, downloading it if needed.

    Args:
        key: Object key; converted with ``str`` for both the bucket call
            and the cache path.

    Returns:
        The path of the cached file under ``conf.AWS_LOCAL_CACHE_FOLDER``.
    """
    l.debug(f"received key: {key}")
    key_str = str(key)
    # Use the same string form as the bucket call, so that non-str keys do
    # not break the path concatenation.
    filename = conf.AWS_LOCAL_CACHE_FOLDER + key_str
    if Path(filename).is_file():
        l.debug('File is already cached')
        return filename

    # No cached copy yet: fetch the object and write it to the cache.
    l.debug('File is not found. Downloading')
    response = self.client.get_object(Bucket=self.bucket, Key=key_str)
    data = response['Body'].read()
    self._makedirs(filename)
    with open(filename, "wb") as f:
        f.write(data)
    return filename
def load_from_cloud(cls, model_id: str = None, object_id=None) -> AbstractModel:
    """Load a stored model by ``model_id`` or, failing that, by ``object_id``.

    Returns:
        The instance built by ``cls.load_from_mongo_object``.

    Raises:
        RecordNotFound: If no record matches or no identifier is given.
    """
    obj = None
    if model_id is not None:
        l.debug(f'Loading model with model_id={model_id}')
        obj = ModelCollection.objects_safe(model_id=model_id).first()
    elif object_id is not None:
        l.debug(f'Loading model with object_id={object_id}')
        obj = ModelCollection.objects_safe(pk=object_id).first()

    if not obj:
        raise RecordNotFound('Cannot find model')

    model_instance = cls.load_from_mongo_object(obj)
    l.debug('Model is loaded successfully')
    return model_instance
def get_datasets_from_exp_config(ec: ExperimentConfiguration) -> List[Dataset]:
    """Build the labelled datasets described by an experiment configuration.

    One ``Dataset`` is created per entry of ``ec.DATASOURCES``. When
    ``DATASOURCE_SHIFT`` is positive, the data sources are loaded once and
    one dataset per source is created for every shift in
    ``range(0, ec.PREPROCESS_PERIOD_MINUTES, DATASOURCE_SHIFT)``.
    Rebalancing is turned on only for sources tagged ``TRAIN``.

    Raises:
        ValueError: If ``ec`` has no ``DATASOURCES`` attribute.
    """
    if not hasattr(ec, 'DATASOURCES'):
        raise ValueError('Experiment configuration does not have DATASOURCES')

    datasource_shift = getattr(ec, 'DATASOURCE_SHIFT', 0)
    datasets = []
    if datasource_shift > 0:
        # Preload every data source once so all shifts reuse it.
        datasources_dict = {}
        for source_string in ec.DATASOURCES:
            datasource = Dataset._get_datasource(
                source_string=source_string,
                default_source_type=getattr(ec, 'DEFAULT_SOURCE_TYPE', None))
            datasources_dict[source_string] = datasource.load()
        l.debug(f'Got DATASOURCE_SHIFT equal to {datasource_shift}')

        for shift in range(0, ec.PREPROCESS_PERIOD_MINUTES, datasource_shift):
            l.debug(f'Processing with shift={shift}')
            shifted = []
            for source_string, tags in ec.DATASOURCES.items():
                shifted.append(
                    Dataset(data_source=datasources_dict[source_string],
                            experiment_configuration=ec,
                            add_label=True,
                            turn_on_rebalancing='TRAIN' in tags,
                            tags=tags,
                            shift=shift))
            datasets = datasets + shifted
    else:
        for source_string, tags in ec.DATASOURCES.items():
            datasets.append(
                Dataset(source_string=source_string,
                        experiment_configuration=ec,
                        add_label=True,
                        turn_on_rebalancing='TRAIN' in tags,
                        tags=tags))

    l.debug(f'{len(ec.DATASOURCES)} sources --> {len(datasets)} datasets')
    return datasets
def fine_tune_experiment(exp_obj_id, model_space, top_mean=0, top_diff=0, remove_prev_results=False, minimal_deals_per_day=0.2, search_iterations=80):
    """Re-train the best points of an experiment with a randomized search.

    The top ``top_mean`` points by ``test_mean`` and the top ``top_diff``
    points by ``test_diff`` are selected among points that are not yet
    fine tuned. For each one a model is trained on the ``TRAIN`` datasets
    with a randomized parameter search, evaluated on ``TEST`` and ``VAL``,
    and saved as a new point with ``fine_tuned=True``.

    Args:
        exp_obj_id: Id of the experiment.
        model_space: Mapping of parameter name to a dict with ``type`` and
            ``bounds``; only entries of type ``'int'`` are used.
        top_mean: How many points to take by ``test_mean``.
        top_diff: How many points to take by ``test_diff``.
        remove_prev_results: Delete existing fine-tuned points first.
        minimal_deals_per_day: Lower bound (exclusive) on
            ``test_deals_per_day`` for a point to be selected.
        search_iterations: Iterations of the randomized search.
    """
    l.debug(f'Fine tuning experiment. Space is {model_space}')
    model_param_distributions = {}
    for k, v in model_space.items():
        if v['type'] == 'int':
            model_param_distributions[k] = sp_randint(v['bounds'][0],
                                                      v['bounds'][1])

    if top_mean == 0 and top_diff == 0:
        # The message names the real parameters (there is no ``top_profit``).
        l.warning('No finetuning is done. Provide top_mean or top_diff')
        return

    if remove_prev_results:
        for p in mongo.Point.objects_safe(experiment=exp_obj_id,
                                          fine_tuned=True):
            p.delete()
        already_fine_tuned = []
    else:
        already_fine_tuned = [
            p.step
            for p in mongo.Point.objects_safe(experiment=exp_obj_id,
                                              fine_tuned=True).only('step')
        ]

    # NOTE(review): when only one of top_mean/top_diff is non-zero the other
    # query runs with limit(0) - confirm how the installed mongoengine
    # treats that.
    top_mean_points = mongo.Point.objects_safe(
        experiment=exp_obj_id,
        fine_tuned__in=[None, False],
        test_deals_per_day__gt=minimal_deals_per_day,
        step__nin=already_fine_tuned).order_by('-test_mean').limit(top_mean)
    top_diff_points = mongo.Point.objects_safe(
        experiment=exp_obj_id,
        fine_tuned__in=[None, False],
        test_deals_per_day__gt=minimal_deals_per_day,
        step__nin=already_fine_tuned).order_by('-test_diff').limit(top_diff)

    ec = classes.ExperimentConfiguration()
    # load initial config that we will update later on
    ec.load_from_experiment_step(exp_obj_id, 1, fine_tuned=False)

    datasets = None
    processed_points = set()  # a point may appear in both selections
    for points_set in [top_mean_points, top_diff_points]:
        for p in points_set:
            if p.id in processed_points:
                continue
            processed_points.add(p.id)

            ec.adjust_parameters(**p.coordinates)
            # assume that there are same datasets used within entire experiment
            if datasets is None:
                datasets = classes.SKModel.get_datasets_from_exp_config(ec)
            else:
                for d in datasets:
                    d.update(update_source=False)

            model = classes.SKModel(ec)
            model.train(datasets=datasets,
                        tags='TRAIN',
                        model_param_distributions=model_param_distributions,
                        random_search_iterations=search_iterations)
            result_test, result_list_test = model.evaluate(datasets,
                                                           tags='TEST')
            result_val, result_list_val = model.evaluate(datasets, tags='VAL')

            point = classes.Point(
                step=p.step,
                evaluation_on_test=result_test.mongo_object_with_deals,
                evaluation_on_val=result_val.mongo_object,
                detailed_evaluation_on_val=[
                    r.mongo_object for r in result_list_val
                ],
                detailed_evaluation_on_test=[
                    r.mongo_object for r in result_list_test
                ],
                coordinates=p.coordinates,
                experiment=exp_obj_id,
                test_days=result_test.days,
                test_mean=result_test.mean,
                test_std=result_test.std,
                test_deals_per_day=result_test.deals_per_day,
                test_diff=result_test.diff,
                test_min=result_test.min,
                test_max=result_test.max,
                test_total=result_test.total,
                val_days=result_val.days,
                val_mean=result_val.mean,
                val_std=result_val.std,
                val_deals_per_day=result_val.deals_per_day,
                val_diff=result_val.diff,
                val_min=result_val.min,
                val_max=result_val.max,
                val_total=result_val.total,
                # Only a randomized search exposes ``best_params_``.
                clf_params=getattr(model.clf, 'best_params_', None),
                fine_tuned=True)
            point.save_safe()