예제 #1
0
def main():
    """Ensemble all models inside the experiments folder.

    Every entry of the ``experiments`` folder that contains a
    ``prediction/valid.csv`` file contributes its validation predictions.
    The predictions are combined with ``ensemble``, scored with
    ``compute_metrics``, printed and saved to
    ``experiments/ensemble_summary.csv``.

    # Raises
    AssertionError
        If the experiments folder holds fewer than two entries, or if the
        ensembled predictions contain missing values.
    ValueError
        If no entry of the experiments folder provides a
        ``prediction/valid.csv`` file.
    """
    # we assume all the experiments are saved
    # in the experiments folder
    path = Path('experiments')
    # get a list of all experiments name; os.listdir returns the entries
    # in arbitrary order, so sort them to keep the reported experiment
    # name (and the rows kept as target below) stable between runs
    experiment_list = sorted(os.listdir(path))
    # NOTE(review): this counts every directory entry, including files such
    # as the 'ensemble_summary.csv' written at the end of this function.
    assert len(experiment_list) > 1, \
           'there is not enough experiments to ensemble'
    predictions = []
    # for every experiment
    for experiment in experiment_list:
        # create a path to the valid prediction file
        path_to_pred = path.joinpath(experiment, 'prediction', 'valid.csv')
        if not os.path.exists(path_to_pred):
            continue
        # if this file exists, we read it and
        # set the experiment column to the name of this experiment
        pred_exp = load_data.read_csv(path_to_pred)
        predictions.append(pred_exp.assign(experiment=experiment))
    # fail with a clear message instead of the opaque
    # "No objects to concatenate" raised by pd.concat on an empty list
    if not predictions:
        raise ValueError(
            f"no 'prediction/valid.csv' file found in any entry of '{path}'")
    # concat all the predictions
    predictions = pd.concat(predictions)
    # create the target by dropping all duplicates: one row per
    # (period, timedelta), without the prediction columns
    target = (predictions
              .drop_duplicates(subset=['period', 'timedelta'])
              .reset_index(drop=True)
              .drop(columns=default.yhat))

    # ensemble
    predictions_ensemble = ensemble(predictions)

    target_ensemble = target.merge(predictions_ensemble,
                                   on=['period', 'timedelta'],
                                   how='left')
    # check there is non nan values
    assert target_ensemble[default.yhat].isna().sum().sum() == 0
    # compute the metrics
    # (presumably a dict of metric name -> value; it must at least
    # support item assignment -- confirm against compute_metrics)
    ensemble_metrics = compute_metrics(target_ensemble)
    # only the experiments that actually provided predictions
    experiment_list = list(predictions['experiment'].unique())
    ensemble_metrics['experiment'] = '__'.join(experiment_list)
    ensemble_metrics['n_model'] = len(experiment_list)
    results = pd.DataFrame([ensemble_metrics])
    # print scores
    print(results.head())
    # save the ensemble results in a CSV file
    results.to_csv(path / 'ensemble_summary.csv', index=False)
예제 #2
0
def main():
    """
    Convert the raw solar wind CSV file into a Feather file.

    The directory locations come from ``./config/config.yml``: the CSV is
    read from the ``raw`` directory and the Feather file is written to the
    ``interim`` directory, which is created when it does not exist yet.
    """
    # load the main config file and keep only its directory section
    dirs = load_data.read_config_file('./config/config.yml')['directories']
    source_csv = Path(dirs['raw']) / 'solar_wind.csv'
    # make sure the output directory exists before reading anything
    out_dir = Path(dirs['interim'])
    out_dir.mkdir(exist_ok=True, parents=True)
    target_feather = out_dir / 'solar_wind.feather'

    logging.info('reading solar wind data..')
    frame = load_data.read_csv(source_csv)

    logging.info('saving to feather..')
    frame.to_feather(target_feather)
예제 #3
0
from load_data import read_csv

# Location of the CSV file loaded when this module is run as a script.
path = "data/sample_submission.csv"

if __name__ == "__main__":
    # Main script to run analysis: load the CSV file at `path`.
    # (A string literal inside an `if` body is not a docstring, only a
    # discarded expression, so the description lives in a comment.)
    df = read_csv(path)
def main(use_sample: bool = False, n_jobs: int = 1):
    """
    Build the dataset used to train the models.

    The following steps are applied in order:
        - read the config file and the data files
        - preprocess the solar wind series and compute its features
        - create the target for the actual time t and t + 1 hour
        - preprocess the satellite positions
        - take the log of smoothed_ssn values
        - merge all datasets into a single one
        - save the dataset as a Feather file for future modeling
    # Params
    use_sample: `bool`, optional(default=False)
        Whether or not to use the sample dataset
    n_jobs: `int`, optional(default=1)
        The number of jobs to run in parallel

    """
    logging.info(f'use_sample={use_sample}, n_jobs={n_jobs}')
    logging.info('reading config file')
    dirs = load_data.read_config_file('./config/config.yml')['directories']
    raw_dir = Path(dirs['raw'])
    interim_dir = Path(dirs['interim'])
    out_dir = Path(dirs['processed'])
    out_dir.mkdir(exist_ok=True, parents=True)

    # the sample flag selects both the input file and the output name
    if use_sample:
        wind_file, output_filename = 'sample_solar_wind.feather', 'fe_sample'
    else:
        wind_file, output_filename = 'solar_wind.feather', 'fe'

    logging.info('reading training data')
    dst_labels = load_data.read_csv(raw_dir / 'dst_labels.csv')
    wind = load_data.read_feather(interim_dir / wind_file)
    sunspots = load_data.read_csv(raw_dir / 'sunspots.csv')
    stl_pos = load_data.read_csv(raw_dir / 'satellite_positions.csv')

    logging.info('preprocessing solar wing')
    # index the solar wind series by timedelta, then preprocess it
    wind = solar_wind_preprocessing(wind.set_index('timedelta'))

    logging.info('computing features')
    started_at = time.time()
    data = split_into_period(wind,
                             features=default.init_features,
                             n_jobs=n_jobs)
    # the elapsed time is logged in minutes
    minutes = (time.time() - started_at) / 60
    logging.info(f'elapsed time {minutes:.4f}')

    logging.info('merging other datasets')
    target = create_target(dst_labels)
    stl_pos = stl_preprocessing(stl_pos)
    # log-transform the smoothed sunspot numbers
    sunspots['smoothed_ssn'] = np.log(sunspots['smoothed_ssn'])
    # attach the satellite positions first, then the sunspots
    data = merge_daily(merge_daily(data, stl_pos), sunspots)
    # attach the target, drop the rows where t0 or t1 is missing
    # and renumber the remaining rows
    data = (data.merge(target, how='left', on=['period', 'timedelta'])
                .dropna(subset=['t0', 't1'])
                .reset_index(drop=True))

    logging.info('saving')
    data.to_feather(out_dir / f'{output_filename}.feather')
예제 #5
0
import import_path
import numpy as np
import pandas as pd
from models.oracle import Oracle
from models.surrogate_teacher import Surrogate
from models.omniscient_teacher import Omniscient
from models.random_teacher import Random
from models.without_teacher import Without_teacher
from utils import predict, predict_by_W, rmse_W, write_np2csv, rmse_w, make_random_mask, predict_wj
from load_data import read_W, read_csv, split_data
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import logging
import datetime
# %%
# Load the dataset and split it into train/test features and labels.
# NOTE(review): the meaning of the second argument of split_data is not
# visible here -- confirm against load_data.split_data.
df = read_csv('output/wine-quality-pm1.csv', header=0)
train_X, test_X, train_y, test_y = split_data(df, True)
# Experiment hyper-parameters.
# NOTE: `lambd` is rebound by the `for lambd in lambds` loop further down,
# so the value 2 assigned here is overwritten before the loop body runs.
eta, lambd, alpha = 1, 2, 0.01
training_epochs, loops = 10, 10
J = 10
# Total number of teaching materials to present
textbook = 500
# Number of teaching materials used for estimation
test_textbook_list = [100]
# Number of teaching materials presented between estimations
between_textbook_list = [1]
# Group (set)
k = 1
# Values of `lambd` iterated over by the loop that follows
lambds = [1, 2, 3, 4, 5]

for lambd in lambds:
예제 #6
0
import numpy as np
import pandas as pd
from models.oracle import Oracle
from models.surrogate_teacher import Surrogate
from models.omniscient_teacher import Omniscient
from models.random_teacher import Random
from models.without_teacher import Without_teacher
from utils import predict, predict_by_W, rmse_W, write_np2csv, rmse_w, make_random_mask, predict_wj
from load_data import read_W, read_csv, split_data
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import logging
import datetime

# %%
# Load the dataset and split it into train/test features and labels.
# NOTE(review): the meaning of the second argument of split_data is not
# visible here -- confirm against load_data.split_data.
df = read_csv('output/weebil_vespula_pm1.csv', header=0)
train_X, test_X, train_y, test_y = split_data(df, False)
# Experiment hyper-parameters.
# NOTE: `lambd` is rebound by the `for lambd in lambds` loop further down,
# so the value 2 assigned here is overwritten before the loop body runs.
eta, lambd, alpha = 1, 2, 0.01
training_epochs, loops = 10, 10
J = 10
# Total number of teaching materials to present
textbook = 500
# Number of teaching materials used for estimation
test_textbook_list = [100]
# Number of teaching materials presented between estimations
between_textbook_list = [1]
# Group (set)
k = 1

# Values of `lambd` iterated over by the loop that follows
lambds = [1, 2, 3, 4, 5]
for lambd in lambds:
예제 #7
0
from load_data import read_csv
from svd import SVD
from sklearn.cluster import MiniBatchKMeans
import pandas as pd
if __name__ == '__main__':
    # Fit the SVD recommender on the user/item count data.
    rec_data = read_csv('../input/user_item_cnt.csv')
    rec = SVD(rec_data)
    rec.fit()

    # `score` maps each user_id to an iterable of per-entry mappings
    # (presumably dicts -- confirm against SVD.get_score). Flatten it into
    # one row per entry, tagging every row with its user_id.
    score = rec.get_score()
    rows = [dict(entry, user_id=user_id)
            for user_id, entries in score.items()
            for entry in entries]
    df = pd.DataFrame(rows)
    # save the flattened scores
    df.to_csv('svd2.csv', index=False)

    # NOTE: disabled experiment, kept for reference (previously stored as
    # a bare string literal): cluster the users from rec.user_matrix.
    #
    # model = MiniBatchKMeans(n_clusters=100, random_state=0)
    # model.fit(rec.user_matrix)
    # pred = model.predict(rec.user_matrix)
    #
    # users = [rec_data.map_idx2user[i] for i in range(len(rec_data.map_idx2user))]
    # max(users)
    # len(rec_data.map_idx2user)
    #
    # df = pd.DataFrame({'user_id': users, 'cluster': pred})
    # df.to_csv('cluster.csv', index=False)
    # Optional check for unexpected values
    if not np.isfinite(prediction_at_t0):
        prediction_at_t0 = -12
    if not np.isfinite(prediction_at_t1):
        prediction_at_t1 = -12

    return prediction_at_t0, prediction_at_t1


if __name__ == '__main__':
    # We use this code for testing
    # It only runs when the module is executed directly: it loads the
    # datasets, keeps the 'train_a' period and indexes them by timedelta.
    # NOTE(review): `time`, `one_minute` and `t_minus_7` are not used in
    # the visible part of this block -- presumably used further down.
    import load_data
    import time
    # hard-coded data locations (no config file is read here)
    raw_path = Path('data/raw/')
    interim_path = Path('data/interim')
    dst_labels = load_data.read_csv(raw_path / 'dst_labels.csv')
    solar_wind = load_data.read_feather(interim_path / 'solar_wind.feather')
    sunspots = load_data.read_csv(raw_path / 'sunspots.csv')
    stl_pos = load_data.read_csv(raw_path / 'satellite_positions.csv')

    # reference point in time used for the test: 7 days
    date = pd.to_timedelta(7, unit='d')
    # date = pd.to_timedelta('111 days 04:00:00')
    one_minute = pd.to_timedelta("1 minute")
    seven_days = pd.to_timedelta("7 days")
    # keep only the rows of the 'train_a' period
    solar_wind = solar_wind[solar_wind['period'] == 'train_a']
    sunspots = sunspots[sunspots['period'] == 'train_a']
    stl_pos = stl_pos[stl_pos['period'] == 'train_a']
    # use the timedelta column as the index of every dataframe
    solar_wind.set_index(['timedelta'], inplace=True)
    stl_pos.set_index(['timedelta'], inplace=True)
    sunspots.set_index(['timedelta'], inplace=True)
    # seven days before `date`
    t_minus_7 = date - seven_days
예제 #9
0
            c[j][0] += (math.pow(time[i], j) * price[i])
    matrix_c = np.matrix(c)
    # Now Calc mean
    mean_matrix = matrix_a * matrix_S * matrix_c * beta
    mean = mean_matrix.item(0)
    # Now Calc 𝜑(x)
    d = [[0] for _ in range(M + 1)]
    for i in range(M + 1):
        d[i][0] = math.pow(test_val, i)
    matrix_d = np.matrix(d)
    # Now Calc variance
    variance = math.sqrt((matrix_a * matrix_S * matrix_d)[0][0] + (1 / beta))
    # Now show results
    print("Predicted Val   : {:.4f}".format(mean))
    print("Actual Val      : {:.4f}".format(price[-1]))
    print("Range Predction : [{:.4f}, {:.4f}]".format(mean - 3 * variance,
                                                      mean + 3 * variance))
    print("Absolute Error  : {:.4f}".format(abs(price[-1] - mean)))
    print("Relative Error  : {:.4f}%".format(
        abs(price[-1] - mean) / price[-1] * 100))


if __name__ == "__main__":
    # Print a prediction summary for every CSV file found in the data folder.
    csv_path = "./data/"
    for csv_name in load_data.find_csv(csv_path):
        # Banner: five dashes, the file name up to its first dot,
        # " Summary", then dash padding, cut to 40 characters.
        base_name = csv_name.split(".")[0]
        banner = "-" * 5 + base_name + " Summary" + "-" * 20
        print(banner[:40])
        # Load the series and pass its last time value as the third argument.
        time, price = load_data.read_csv(csv_path, csv_name)
        baysian_regression_predict(time, price, time[-1])
        # Closing rule followed by blank lines to separate the files.
        print("-" * 40, end="\n\n\n")