def main(frac: float = default_sample_frac):
    """
    Creates a sample of the solar wind data and saves it.

    # Parameters
    frac: `float`
        Should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the sample dataset.
    """
    logging.info(f'making a sample with frac={frac}')
    logging.info('reading config file')
    config = load_data.read_config_file('./config/config.yml')
    directories = config['directories']
    interim_path = Path(directories['interim'])

    # reading ground-truth data
    logging.info('reading training data')
    solar_wind = load_data.read_feather(interim_path / 'solar_wind.feather')

    logging.info('splitting dataset')
    _, valid_idx = load_data.split_train_data(solar_wind,
                                              test_frac=frac,
                                              eval_mode=True)
    sample_data = solar_wind.loc[valid_idx, :]
    sample_data.reset_index(drop=True, inplace=True)

    logging.info('saving file..')
    sample_data.to_feather(interim_path / 'sample_solar_wind.feather')
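
# A minimal command-line entry point for this script (a sketch; the argparse
# wiring below is an assumption, not the repository's own launcher).
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Create and save a sample of the solar wind data.')
    parser.add_argument('--frac', type=float, default=default_sample_frac,
                        help='proportion of the dataset to include in the sample')
    args = parser.parse_args()
    # basic logging config for standalone runs
    logging.basicConfig(level=logging.INFO)
    main(frac=args.frac)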
def main():
    """
    Saves the solar wind data as a Feather file.
    """
    # read the main config file
    config = load_data.read_config_file('./config/config.yml')

    # get the paths to the raw and interim directories
    directories = config['directories']
    raw_path = Path(directories['raw'])
    interim_path = Path(directories['interim'])
    interim_path.mkdir(exist_ok=True, parents=True)

    logging.info('reading solar wind data..')
    # reading CSV file
    solar_wind = load_data.read_csv(raw_path / 'solar_wind.csv')

    logging.info('saving to feather..')
    # saving as feather file
    solar_wind.to_feather(interim_path / 'solar_wind.feather')
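
# A quick round-trip check (a hypothetical helper, not part of the original
# script): a cheap smoke test that the Feather copy preserves the CSV contents
# before downstream steps depend on it.
def check_roundtrip():
    config = load_data.read_config_file('./config/config.yml')
    directories = config['directories']
    raw = load_data.read_csv(Path(directories['raw']) / 'solar_wind.csv')
    interim = load_data.read_feather(
        Path(directories['interim']) / 'solar_wind.feather')
    # identical length and column order is a reasonable smoke test here
    assert len(raw) == len(interim)
    assert list(raw.columns) == list(interim.columns)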
def main(use_sample: bool = False, n_jobs: int = 1):
    """
    Applies all the steps needed to create a dataset ready for
    training models. The steps are:
        - read the data
        - compute the solar wind features
        - compute satellite position features
        - take the log of smoothed_ssn values
        - create the target for the current time t and for t + 1 hour
        - merge all datasets into a single one
        - save the dataset for future modeling

    # Parameters
    use_sample: `bool`, optional(default=False)
        Whether or not to use the sample dataset
    n_jobs: `int`, optional(default=1)
        The number of jobs to run in parallel
    """
    logging.info(f'use_sample={use_sample}, n_jobs={n_jobs}')
    logging.info('reading config file')
    config = load_data.read_config_file('./config/config.yml')

    # directories
    directories = config['directories']
    raw_path = Path(directories['raw'])
    interim_path = Path(directories['interim'])
    processed_path = Path(directories['processed'])
    processed_path.mkdir(exist_ok=True, parents=True)

    # reading ground-truth data
    solar_wind_file = ('sample_solar_wind.feather'
                       if use_sample else 'solar_wind.feather')

    logging.info('reading training data')
    dst_labels = load_data.read_csv(raw_path / 'dst_labels.csv')
    solar_wind = load_data.read_feather(interim_path / solar_wind_file)
    sunspots = load_data.read_csv(raw_path / 'sunspots.csv')
    stl_pos = load_data.read_csv(raw_path / 'satellite_positions.csv')

    logging.info('preprocessing solar wind')
    # preprocessing solar wind
    # setting timedelta as index
    solar_wind.set_index('timedelta', inplace=True)
    # preprocessing solar wind time series
    solar_wind = solar_wind_preprocessing(solar_wind)

    logging.info('computing features')
    start = time.time()
    # computing solar wind features
    data = split_into_period(solar_wind,
                             features=default.init_features,
                             n_jobs=n_jobs)
    elapsed_time = (time.time() - start) / 60
    logging.info(f'elapsed time {elapsed_time:.4f} minutes')

    logging.info('merging other datasets')
    # create target
    target = create_target(dst_labels)
    # preprocessing satellite positions
    stl_pos = stl_preprocessing(stl_pos)
    # taking the log of smoothed_ssn values
    sunspots['smoothed_ssn'] = np.log(sunspots['smoothed_ssn'])

    # merging dataframes into the main dataframe
    data = merge_daily(data, stl_pos)
    data = merge_daily(data, sunspots)
    # merging the target dataframe into the main dataframe
    data = data.merge(target, how='left', on=['period', 'timedelta'])
    # dropping the last rows, where the target is not available
    data.dropna(subset=['t0', 't1'], inplace=True)
    # reset index
    data.reset_index(inplace=True, drop=True)

    logging.info('saving')
    output_filename = 'fe' if not use_sample else 'fe_sample'
    # saving to feather format
    data.to_feather(processed_path / f'{output_filename}.feather')
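
# A minimal sketch of how split_into_period might parallelize the per-period
# feature computation with joblib (hypothetical: the real implementation lives
# in the project's feature-engineering module, this version assumes the solar
# wind frame carries a 'period' column, and _compute_period_features is a
# placeholder for the project's rolling/aggregate statistics).
from joblib import Parallel, delayed
import pandas as pd


def _compute_period_features(period_df: pd.DataFrame, features: list) -> pd.DataFrame:
    # placeholder aggregation; the project computes far richer statistics
    return period_df[features].agg(['mean', 'std'])


def split_into_period_sketch(solar_wind: pd.DataFrame, features: list,
                             n_jobs: int = 1) -> list:
    # one group per observation period, processed in parallel
    groups = [group for _, group in solar_wind.groupby('period')]
    return Parallel(n_jobs=n_jobs)(
        delayed(_compute_period_features)(group, features) for group in groups)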
def main(experiment_path: str,
         eval_mode: bool = True,
         use_sample: bool = False,
         test_frac: float = 0.2,
         message: str = None,
         fi_threshold: float = None):
    """
    Trains or validates an experiment.

    # Parameters
    experiment_path: `str`
        A path to the experiment's folder. The folder must contain a
        config file named config.yml with the following keys:
            model: `str`
                the path to the model config file
            pipeline: `str`
                the path to the pipeline config file
            optimizer: `Dict[str, Any]`
                the parameters for the Adam optimizer
            epochs: `int`, optional(default=10)
                the number of epochs to train the model
            use_sigmoid: `bool`, optional(default=False)
                whether or not to use a sigmoid as the final
                activation function of the model
    eval_mode: `bool`, optional (default=True)
        if True, part of the data is held out for validation and the
        results are logged to mlflow; if False, the model is trained
        on all the available data and saved for inference.
    use_sample: `bool`, optional (default=False)
        if True, only a sample of the dataset is used. Before enabling
        it, execute make_sample.py to create the sample dataset.
    test_frac: `float`, optional (default=0.2)
        if eval_mode is True, the valid dataset will contain this
        fraction of the main dataset.
    message: `str`, optional (default=None)
        we use mlflow to keep track of all parameters and errors of
        each experiment; this parameter will register any string you
        pass into the experiment record in mlflow.
    fi_threshold: `float`, optional (default=None)
        if a feature importance file already exists, only features
        with importance values greater than fi_threshold will be used.
    """
    # getting experiment name
    experiment = os.path.basename(experiment_path)
    logging.info(f'running {experiment}')
    logging.info(f'eval_mode={eval_mode}, use_sample={use_sample}')

    logging.info('reading config file')
    # creating experiment path and loading experiment config file
    experiment_path = Path(experiment_path)
    config = load_data.read_config_file('./config/config.yml')
    experiment_config = load_data.read_config_file(experiment_path / 'config.yml')
    # reading the experiment's model and pipeline config files
    pipeline_config = load_data.read_config_file(experiment_config['pipeline'])
    model_config = load_data.read_config_file(experiment_config['model'])

    directories = config['directories']
    # getting the data path
    processed_path = Path(directories['processed'])
    # creating a prediction folder to save predictions after training
    prediction_path = experiment_path / 'prediction'
    prediction_path.mkdir(exist_ok=True, parents=True)
    # creating a model path to save models after training
    model_path = experiment_path / 'models'

    # reading preprocessed data
    filename = ('fe' if not use_sample else 'fe_sample')
    logging.info('reading training data')
    data = load_data.read_feather(processed_path / f'{filename}.feather')

    logging.info('splitting dataset')
    train_idx, valid_idx = load_data.split_train_data(data,
                                                      test_frac=test_frac,
                                                      eval_mode=eval_mode)
    train_data = data.loc[train_idx, :]
    valid_data = data.loc[valid_idx, :]
    train_data.reset_index(drop=True, inplace=True)
    valid_data.reset_index(drop=True, inplace=True)

    # importing pipeline
    logging.info('building pipeline')
    pipeline = build_pipeline(pipeline_config)
    logging.info(f'{pipeline}')

    # fit pipeline
    logging.info('training pipeline')
    pipeline.fit(train_data)

    # transform both the training and valid datasets
    logging.info('transforming datasets')
    train_data = pipeline.transform(train_data)
    valid_data = pipeline.transform(valid_data)

    # getting the lower and upper limits of the target in case we want
    # to use the sigmoid function as the final activation of our model
    use_sigmoid = experiment_config.pop('use_sigmoid', False)
    y_limit = ((train_data['t0'].agg(('max', 'min')) * 1.2).to_list()
               if use_sigmoid else None)

    # loading the features to train our model;
    # if a feature importance file exists, we can use it to train
    # our model only with the relevant features
    features = load_data.get_features(train_data,
                                      experiment_path=experiment_path,
                                      fi_threshold=fi_threshold,
                                      ignore_features=default.ignore_features)
    in_features = len(features)
    logging.info(f'modeling using {len(features)} features')
    logging.info(f'{features[:30]}')

    # creating datasets
    train_ds = Dataset.from_dataframe(train_data,
                                      features=features,
                                      target=target_name,
                                      device=device)
    valid_ds = Dataset.from_dataframe(valid_data,
                                      features=features,
                                      target=target_name,
                                      device=device)
    # creating dataloaders
    train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
    not_shuffle_train_dl = DataLoader(dataset=train_ds,
                                      batch_size=batch_size,
                                      shuffle=False)
    valid_dl = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=False)
    # creating databunch
    bunch = DataBunch(train_dl, valid_dl)

    # importing the model instance
    model_instance = model_library[model_config['instance']]
    # init the model
    model = model_instance(in_features=in_features,
                           out_features=len(target_name),
                           y_limit=y_limit,
                           **model_config['parameters']).to(device=device)
    # init optimizer
    optimizer = optim.Adam(model.parameters(), **experiment_config['optimizer'])

    # creating learner instance
    logging.info('creating learner instance')
    cbs = [
        Recoder,
        MetricRecorderCallBack(metrics.torch_rmse),
        ModelCheckpointCallBack,
        ProgressBarCallBack
    ]
    learner = Learner(model, optimizer, bunch, callbacks=cbs)

    logging.info('training model')
    # importing epochs, default is 10
    epochs = experiment_config.pop('epochs', 10)
    # train the model
    learner.fit(epochs, seed=2020)
    # average the weights of the last 5 epochs
    top_models = np.arange(epochs)[-5:]
    learner.modelcheckpoint.load_averaged_model(top_models)

    logging.info('predicting with h0 and h1 models')
    # predicting
    valid_output = predict_dl(learner.model, valid_dl)
    train_output = predict_dl(learner.model, not_shuffle_train_dl)
    valid_data[['yhat_t0', 'yhat_t1']] = valid_output['prediction'].numpy()
    train_data[['yhat_t0', 'yhat_t1']] = train_output['prediction'].numpy()

    # computing metrics
    train_error = compute_metrics(train_data, suffix='_train')
    valid_error = compute_metrics(valid_data, suffix='_valid')
    train_error_period = compute_metrics_per_period(train_data, suffix='_train')
    valid_error_period = compute_metrics_per_period(valid_data, suffix='_valid')

    logging.info('errors')
    logging.info(f'{train_error}')
    logging.info(f'{valid_error}')
    logging.info('period errors')
    logging.info(f'{train_error_period}')
    logging.info(f'{valid_error_period}')

    if eval_mode:
        with mlflow.start_run(run_name=experiment):
            # saving predictions
            train_prediction = train_data.loc[:, default.keep_columns]
            train_prediction.to_csv(prediction_path / 'train.csv', index=False)
            # saving training progress
            learner.metrics_table.to_csv(experiment_path / 'trn_progress.csv',
                                         index=False)
            # saving errors
            train_error_period.to_csv(experiment_path / 'train_errors.csv',
                                      index=False)
            valid_error_period.to_csv(experiment_path / 'valid_errors.csv',
                                      index=False)
            # valid_prediction = valid_data.loc[:, default.keep_columns]
            valid_data.to_csv(prediction_path / 'valid.csv', index=False)

            # saving feature importances if available
            fi = permutation_importance(model=learner.model,
                                        data=valid_data,
                                        features=features,
                                        target=target_name,
                                        score_func=metrics.rmse)
            if fi_threshold is None:
                # the model predicts both horizons jointly, so the same
                # importances are saved for h0 and h1
                fi.to_csv(experiment_path / 'fi_h0.csv', index=False)
                fi.to_csv(experiment_path / 'fi_h1.csv', index=False)

            # saving to mlflow
            # saving metrics
            mlflow.log_metrics(train_error)
            mlflow.log_metrics(valid_error)
            # saving model parameters
            mlflow.log_params(model_config['parameters'])
            mlflow.log_params(experiment_config['optimizer'])
            mlflow.log_params({
                'epochs': epochs,
                'use_sigmoid': use_sigmoid,
                'fi_threshold': fi_threshold,
                'in_features': in_features
            })

            tags = {
                'use_sample': use_sample,
                'model_instance': model_config['instance'],
                'experiment': experiment
            }
            if message is not None:
                tags['message'] = message
            mlflow.set_tags(tags)
    else:
        test_error = calculate_error_on_test(train_data)
        test_error = pd.DataFrame([test_error])
        test_error.to_csv(experiment_path / 'check_test_error.csv', index=False)

        # creating the model path and saving artifacts
        model_path.mkdir(exist_ok=True, parents=True)
        joblib.dump(learner.model, model_path / 'model_h0.pkl')
        joblib.dump(pipeline, model_path / 'pipeline.pkl')
        joblib.dump(features, model_path / 'features.pkl')
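
# For reference, a hypothetical experiment config.yml matching the keys the
# docstring above describes (paths and values are illustrative, not taken
# from the repository):
#
#   model: ./config/models/mlp.yml
#   pipeline: ./config/pipelines/standard.yml
#   optimizer:
#     lr: 0.001
#     weight_decay: 0.0001
#   epochs: 15
#   use_sigmoid: true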
def main(experiment_path: str,
         eval_mode: bool = True,
         use_sample: bool = False,
         test_frac: float = 0.2,
         message: str = None,
         fi_threshold: float = None):
    """
    Trains or validates an experiment.

    # Parameters
    experiment_path: `str`
        A path to the experiment's folder. The folder must contain a
        config file named config.yml with the following keys:
            model: `str`
                the path to the model config file
            pipeline: `str`
                the path to the pipeline config file
    eval_mode: `bool`, optional (default=True)
        if True, part of the data is held out for validation and the
        results are logged to mlflow; if False, the model is trained
        on all the available data and saved for inference.
    use_sample: `bool`, optional (default=False)
        if True, only a sample of the dataset is used. Before enabling
        it, execute make_sample.py to create the sample dataset.
    test_frac: `float`, optional (default=0.2)
        if eval_mode is True, the valid dataset will contain this
        fraction of the main dataset.
    message: `str`, optional (default=None)
        we use mlflow to keep track of all parameters and errors of
        each experiment; this parameter will register any string you
        pass into the experiment record in mlflow.
    fi_threshold: `float`, optional (default=None)
        if a feature importance file already exists, only features
        with importance values greater than fi_threshold will be used.
    """
    # getting experiment name
    experiment = os.path.basename(experiment_path)
    logging.info(f'running {experiment}')
    logging.info(f'eval_mode={eval_mode}, use_sample={use_sample}')

    logging.info('reading config file')
    # creating experiment path and loading experiment config file
    experiment_path = Path(experiment_path)
    config = load_data.read_config_file('./config/config.yml')
    experiment_config = load_data.read_config_file(experiment_path / 'config.yml')
    # reading the experiment's model and pipeline config files
    pipeline_config = load_data.read_config_file(experiment_config['pipeline'])
    model_config = load_data.read_config_file(experiment_config['model'])

    directories = config['directories']
    # getting the data path
    processed_path = Path(directories['processed'])
    # creating a prediction folder to save predictions after training
    prediction_path = experiment_path / 'prediction'
    prediction_path.mkdir(exist_ok=True, parents=True)
    # creating a model path to save models after training
    model_path = experiment_path / 'models'

    # reading preprocessed data
    filename = ('fe' if not use_sample else 'fe_sample')
    logging.info('reading training data')
    data = load_data.read_feather(processed_path / f'{filename}.feather')

    logging.info('splitting dataset')
    train_idx, valid_idx = load_data.split_train_data(data,
                                                      test_frac=test_frac,
                                                      eval_mode=eval_mode)
    train_data = data.loc[train_idx, :]
    valid_data = data.loc[valid_idx, :]
    train_data.reset_index(drop=True, inplace=True)
    valid_data.reset_index(drop=True, inplace=True)

    # importing pipeline
    logging.info('building pipeline')
    pipeline = build_pipeline(pipeline_config)
    logging.info(f'{pipeline}')

    # fit pipeline
    logging.info('training pipeline')
    pipeline.fit(train_data)

    # transform both the training and valid datasets
    logging.info('transforming datasets')
    train_data = pipeline.transform(train_data)
    valid_data = pipeline.transform(valid_data)

    # loading the features to train our model;
    # if a feature importance file exists, we can use it to train
    # our model only with the relevant features
    features = load_data.get_features(train_data,
                                      experiment_path=experiment_path,
                                      fi_threshold=fi_threshold,
                                      ignore_features=default.ignore_features)
    in_features = len(features)
    logging.info(f'modeling using {len(features)} features')
    logging.info(f'{features[:30]}')

    # importing model instance
    model_instance = model_library[model_config['instance']]

    logging.info('training horizon 0 model')
    # training model for horizon 0
    model_h0 = model_instance(**model_config['parameters'])
    model_h0.fit(train_data.loc[:, features], train_data.loc[:, 't0'])

    logging.info('training horizon 1 model')
    # training model for horizon 1
    model_h1 = model_instance(**model_config['parameters'])
    model_h1.fit(train_data.loc[:, features], train_data.loc[:, 't1'])

    logging.info('predicting with h0 and h1 models')
    # predicting
    train_data['yhat_t0'] = model_h0.predict(train_data.loc[:, features])
    train_data['yhat_t1'] = model_h1.predict(train_data.loc[:, features])
    valid_data['yhat_t0'] = model_h0.predict(valid_data.loc[:, features])
    valid_data['yhat_t1'] = model_h1.predict(valid_data.loc[:, features])

    # compute errors
    train_error = compute_metrics(train_data, suffix='_train')
    valid_error = compute_metrics(valid_data, suffix='_valid')
    train_error_period = compute_metrics_per_period(train_data, suffix='_train')
    valid_error_period = compute_metrics_per_period(valid_data, suffix='_valid')

    logging.info('errors')
    logging.info(f'{train_error}')
    logging.info(f'{valid_error}')
    logging.info('period errors')
    logging.info(f'{train_error_period}')
    logging.info(f'{valid_error_period}')

    if eval_mode:
        with mlflow.start_run(run_name=experiment):
            # saving predictions
            train_prediction = train_data.loc[:, default.keep_columns]
            train_prediction.to_csv(prediction_path / 'train.csv', index=False)
            # saving errors
            train_error_period.to_csv(experiment_path / 'train_errors.csv',
                                      index=False)
            valid_error_period.to_csv(experiment_path / 'valid_errors.csv',
                                      index=False)
            # valid_prediction = valid_data.loc[:, default.keep_columns]
            valid_data.to_csv(prediction_path / 'valid.csv', index=False)

            # saving feature importances if available
            fi_h0 = feature_importances(model_h0, features)
            fi_h1 = feature_importances(model_h1, features)
            if (fi_h0 is not None) and (fi_h1 is not None) and (fi_threshold is None):
                fi_h0.to_csv(experiment_path / 'fi_h0.csv', index=False)
                fi_h1.to_csv(experiment_path / 'fi_h1.csv', index=False)

            # saving to mlflow
            # saving metrics
            mlflow.log_metrics(train_error)
            mlflow.log_metrics(valid_error)
            mlflow.log_params({
                'fi_threshold': fi_threshold,
                'in_features': in_features
            })
            # saving model parameters
            mlflow.log_params(model_config['parameters'])

            tags = {
                'use_sample': use_sample,
                'model_instance': model_config['instance'],
                'experiment': experiment
            }
            if message is not None:
                tags['message'] = message
            mlflow.set_tags(tags)
    else:
        test_error = calculate_error_on_test(train_data)
        test_error = pd.DataFrame([test_error])
        test_error.to_csv(experiment_path / 'check_test_error.csv', index=False)

        # creating the model path and saving artifacts
        model_path.mkdir(exist_ok=True, parents=True)
        joblib.dump(model_h0, model_path / 'model_h0.pkl')
        joblib.dump(model_h1, model_path / 'model_h1.pkl')
        joblib.dump(pipeline, model_path / 'pipeline.pkl')
        joblib.dump(features, model_path / 'features.pkl')
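
# A hypothetical end-to-end workflow for this script (the experiment name and
# path are illustrative): validate first with a holdout split, then retrain on
# all the data and persist the artifacts for inference.
#
#   main('./experiments/lgbm_baseline', eval_mode=True)   # metrics logged to mlflow
#   main('./experiments/lgbm_baseline', eval_mode=False)  # dumps model_h0/h1, pipeline, features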