def main(): """Ensemble all models inside the experiments folder""" # we assume all the experiments are saved # in the experiments folder path = Path('experiments') # get a list of all experiments name experiment_list = os.listdir(path) assert len(experiment_list) > 1, \ 'there is not enough experiments to ensemble' predictions = [] # for every experiment for experiment in experiment_list: # create a path to the valid prediction file path_to_pred = path.joinpath(experiment, 'prediction', 'valid.csv') if not os.path.exists(path_to_pred): continue # if this file exists, we read it and # set the experiment column to the name of this experiment pred_exp = load_data.read_csv(path_to_pred) pred_exp = pred_exp.assign(experiment=experiment) predictions.append(pred_exp) # concat all the predictions predictions = pd.concat(predictions) # create the target by dropping all duplicates target = predictions.drop_duplicates(subset=['period', 'timedelta']) target.reset_index(drop=True, inplace=True) target.drop(columns=default.yhat, inplace=True) # ensemble predictions_ensemble = ensemble(predictions) target_ensemble = target.merge(predictions_ensemble, on=['period', 'timedelta'], how='left') # check there is non nan values assert target_ensemble[default.yhat].isna().sum().sum() == 0 # compute the metrics ensemble_metrics = compute_metrics(target_ensemble) experiment_list = list(predictions['experiment'].unique()) ensemble_metrics['experiment'] = '__'.join(experiment_list) ensemble_metrics['n_model'] = len(experiment_list) results = pd.DataFrame([ensemble_metrics]) # print scores print(results.head()) # save the ensemble results in a CSV file results.to_csv(path / 'ensemble_summary.csv', index=False)
def main(): """ This function will save the solar wind data as a Feather file """ # read the main config file config = load_data.read_config_file('./config/config.yml') # get the path to the CSV File directories = config['directories'] raw_path = Path(directories['raw']) interim_path = Path(directories['interim']) interim_path.mkdir(exist_ok=True, parents=True) logging.info('reading solar wind data..') # reading CSV file solar_wind = load_data.read_csv(raw_path / 'solar_wind.csv') logging.info('saving to feather..') # saving as feather file solar_wind.to_feather(interim_path / 'solar_wind.feather')
from load_data import read_csv

path = "data/sample_submission.csv"

if __name__ == "__main__":
    # Main script to run analysis
    df = read_csv(path)
def main(use_sample: bool = False, n_jobs: int = 1):
    """
    Apply all the steps needed to create a dataset ready for model
    training:
    - read the data
    - compute the solar wind features
    - compute satellite position features
    - take the log of smoothed_ssn values
    - create the target for the current time t and for t + 1 hour
    - merge all datasets into a single one
    - save the dataset for future modeling

    # Params
    use_sample: `bool`, optional (default=False)
        Whether or not to use the sample dataset
    n_jobs: `int`, optional (default=1)
        The number of jobs to run in parallel
    """
    logging.info(f'use_sample={use_sample}, n_jobs={n_jobs}')
    logging.info('reading config file')
    config = load_data.read_config_file('./config/config.yml')

    # directories
    directories = config['directories']
    raw_path = Path(directories['raw'])
    interim_path = Path(directories['interim'])
    processed_path = Path(directories['processed'])
    processed_path.mkdir(exist_ok=True, parents=True)

    # reading ground-truth data
    solar_wind_file = ('sample_solar_wind.feather'
                       if use_sample else 'solar_wind.feather')
    logging.info('reading training data')
    dst_labels = load_data.read_csv(raw_path / 'dst_labels.csv')
    solar_wind = load_data.read_feather(interim_path / solar_wind_file)
    sunspots = load_data.read_csv(raw_path / 'sunspots.csv')
    stl_pos = load_data.read_csv(raw_path / 'satellite_positions.csv')

    logging.info('preprocessing solar wind')
    # setting timedelta as index
    solar_wind.set_index('timedelta', inplace=True)
    # preprocessing the solar wind time series
    solar_wind = solar_wind_preprocessing(solar_wind)

    logging.info('computing features')
    start = time.time()
    # computing solar wind features
    data = split_into_period(solar_wind,
                             features=default.init_features,
                             n_jobs=n_jobs)
    elapsed_time = (time.time() - start) / 60
    logging.info(f'elapsed time {elapsed_time:.4f} min')

    logging.info('merging other datasets')
    # create target
    target = create_target(dst_labels)
    # preprocessing satellite positions
    stl_pos = stl_preprocessing(stl_pos)
    # taking the log of smoothed_ssn values
    sunspots['smoothed_ssn'] = np.log(sunspots['smoothed_ssn'])

    # merging the daily dataframes into the main dataframe
    data = merge_daily(data, stl_pos)
    data = merge_daily(data, sunspots)
    # merging the target dataframe into the main dataframe
    data = data.merge(target, how='left', on=['period', 'timedelta'])
    # dropping the last rows, where the target is not available
    data.dropna(subset=['t0', 't1'], inplace=True)
    # reset index
    data.reset_index(inplace=True, drop=True)

    logging.info('saving')
    output_filename = 'fe' if not use_sample else 'fe_sample'
    # saving to Feather format
    data.to_feather(processed_path / f'{output_filename}.feather')
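# NOTE: `create_target` is imported from elsewhere in the project. A minimal
# sketch of what it could do, based on the docstring above ("create the
# target for the current time t and for t + 1 hour") and on the 't0'/'t1'
# columns used later. It assumes hourly Dst labels with a 'dst' column;
# this is an illustration, not the project's actual implementation.
import pandas as pd

def create_target_sketch(dst_labels: pd.DataFrame) -> pd.DataFrame:
    """Return one row per (period, timedelta) with targets t0 and t1."""
    target = dst_labels.rename(columns={'dst': 't0'})
    # t1 is the Dst value one hour ahead, within the same period
    target['t1'] = target.groupby('period')['t0'].shift(-1)
    return target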
import import_path
import numpy as np
import pandas as pd
from models.oracle import Oracle
from models.surrogate_teacher import Surrogate
from models.omniscient_teacher import Omniscient
from models.random_teacher import Random
from models.without_teacher import Without_teacher
from utils import (predict, predict_by_W, rmse_W, write_np2csv, rmse_w,
                   make_random_mask, predict_wj)
from load_data import read_W, read_csv, split_data
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import logging
import datetime

# %%
df = read_csv('output/wine-quality-pm1.csv', header=0)
train_X, test_X, train_y, test_y = split_data(df, True)

eta, lambd, alpha = 1, 2, 0.01
training_epochs, loops = 10, 10
J = 10  # total number of teaching materials to present
textbook = 500  # number of materials used for estimation
test_textbook_list = [100]  # number of materials presented between estimations
between_textbook_list = [1]
# number of groups
k = 1

lambds = [1, 2, 3, 4, 5]
for lambd in lambds:
import numpy as np
import pandas as pd
from models.oracle import Oracle
from models.surrogate_teacher import Surrogate
from models.omniscient_teacher import Omniscient
from models.random_teacher import Random
from models.without_teacher import Without_teacher
from utils import (predict, predict_by_W, rmse_W, write_np2csv, rmse_w,
                   make_random_mask, predict_wj)
from load_data import read_W, read_csv, split_data
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import logging
import datetime

# %%
df = read_csv('output/weebil_vespula_pm1.csv', header=0)
train_X, test_X, train_y, test_y = split_data(df, False)

eta, lambd, alpha = 1, 2, 0.01
training_epochs, loops = 10, 10
J = 10  # total number of teaching materials to present
textbook = 500  # number of materials used for estimation
test_textbook_list = [100]  # number of materials presented between estimations
between_textbook_list = [1]
# number of groups
k = 1

lambds = [1, 2, 3, 4, 5]
for lambd in lambds:
from load_data import read_csv
from svd import SVD
from sklearn.cluster import MiniBatchKMeans
import pandas as pd

if __name__ == '__main__':
    rec_data = read_csv('../input/user_item_cnt.csv')
    rec = SVD(rec_data)
    rec.fit()
    score = rec.get_score()
    # flatten the per-user score dicts into one row per (user, item)
    tmp = [dict(v, user_id=user_id)
           for user_id, items in score.items()
           for v in items]
    df = pd.DataFrame(tmp)
    print(df.head())
    df.to_csv('svd2.csv', index=False)

    """
    # Alternative: cluster users on the SVD user factors
    model = MiniBatchKMeans(n_clusters=100, random_state=0)
    model.fit(rec.user_matrix)
    pred = model.predict(rec.user_matrix)
    users = [rec_data.map_idx2user[i]
             for i in range(len(rec_data.map_idx2user))]
    df = pd.DataFrame({'user_id': users, 'cluster': pred})
    df.to_csv('cluster.csv', index=False)
    """
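    # A toy illustration of the flattening comprehension above:
    # `dict(v, user_id=user_id)` copies each item dict and adds the user id.
    # The 'item_id'/'score' keys are assumptions about what SVD.get_score()
    # returns; only the {user_id: [dict, ...]} shape is implied by the code.
    toy_score = {1: [{'item_id': 'a', 'score': 0.9},
                     {'item_id': 'b', 'score': 0.4}]}
    rows = [dict(v, user_id=uid)
            for uid, items in toy_score.items()
            for v in items]
    print(rows)
    # [{'item_id': 'a', 'score': 0.9, 'user_id': 1},
    #  {'item_id': 'b', 'score': 0.4, 'user_id': 1}]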
    # Optional check for unexpected values
    if not np.isfinite(prediction_at_t0):
        prediction_at_t0 = -12
    if not np.isfinite(prediction_at_t1):
        prediction_at_t1 = -12

    return prediction_at_t0, prediction_at_t1


if __name__ == '__main__':
    # We use this code for testing
    import load_data
    import time

    raw_path = Path('data/raw/')
    interim_path = Path('data/interim')

    dst_labels = load_data.read_csv(raw_path / 'dst_labels.csv')
    solar_wind = load_data.read_feather(interim_path / 'solar_wind.feather')
    sunspots = load_data.read_csv(raw_path / 'sunspots.csv')
    stl_pos = load_data.read_csv(raw_path / 'satellite_positions.csv')

    date = pd.to_timedelta(7, unit='d')
    # date = pd.to_timedelta('111 days 04:00:00')
    one_minute = pd.to_timedelta("1 minute")
    seven_days = pd.to_timedelta("7 days")

    # keep only the train_a period
    solar_wind = solar_wind[solar_wind['period'] == 'train_a']
    sunspots = sunspots[sunspots['period'] == 'train_a']
    stl_pos = stl_pos[stl_pos['period'] == 'train_a']

    solar_wind.set_index(['timedelta'], inplace=True)
    stl_pos.set_index(['timedelta'], inplace=True)
    sunspots.set_index(['timedelta'], inplace=True)

    t_minus_7 = date - seven_days
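    # The variables above suggest selecting the trailing seven-day window of
    # minute-resolution solar wind data ending at `date` (a sketch, not part
    # of the original test code):
    window = solar_wind.loc[t_minus_7 + one_minute:date]
    print(window.shape)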
            c[j][0] += math.pow(time[i], j) * price[i]

    matrix_c = np.matrix(c)

    # Now calc the predictive mean
    mean_matrix = matrix_a * matrix_S * matrix_c * beta
    mean = mean_matrix.item(0)

    # Now calc 𝜑(x) for the test point
    d = [[0] for _ in range(M + 1)]
    for i in range(M + 1):
        d[i][0] = math.pow(test_val, i)
    matrix_d = np.matrix(d)

    # Now calc the predictive standard deviation
    # (square root of the predictive variance)
    std = math.sqrt((matrix_a * matrix_S * matrix_d).item(0) + (1 / beta))

    # Now show results
    print("Predicted Val   : {:.4f}".format(mean))
    print("Actual Val      : {:.4f}".format(price[-1]))
    print("Range Prediction: [{:.4f}, {:.4f}]".format(mean - 3 * std,
                                                      mean + 3 * std))
    print("Absolute Error  : {:.4f}".format(abs(price[-1] - mean)))
    print("Relative Error  : {:.4f}%".format(
        abs(price[-1] - mean) / price[-1] * 100))


if __name__ == "__main__":
    csv_path = "./data/"
    files = load_data.find_csv(csv_path)
    for f in files:
        print(("-" * 5 + f.split(".")[0] + " Summary" + "-" * 20)[:40])
        time, price = load_data.read_csv(csv_path, f)
        baysian_regression_predict(time, price, time[-1])
        print("-" * 40, end="\n\n\n")
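# For reference, the quantities computed above match the standard predictive
# distribution of Bayesian linear regression with a polynomial basis
# 𝜑(x) = (1, x, x^2, ..., x^M)^T (see e.g. Bishop, PRML, eqs. 3.58-3.59),
# assuming S^{-1} = alpha*I + beta * sum_n 𝜑(x_n) 𝜑(x_n)^T:
#
#   m(x)   = beta * 𝜑(x)^T S sum_n 𝜑(x_n) t_n     (predictive mean)
#   s^2(x) = 1/beta + 𝜑(x)^T S 𝜑(x)               (predictive variance)
#
# In the code, `matrix_c` holds sum_n 𝜑(x_n) t_n, `std` is s(x), and the
# printed range is the mean plus or minus three standard deviations.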