def test_pickle():
    tmpdir = tempfile.mkdtemp(prefix="pickle")
    a = lrange(10)

    # test with str
    path_str = tmpdir + "/res.pkl"
    save_pickle(a, path_str)
    b = load_pickle(path_str)
    assert_equal(a, b)

    # test with pathlib
    path_pathlib = pathlib.Path(tmpdir) / "res2.pkl"
    save_pickle(a, path_pathlib)
    c = load_pickle(path_pathlib)
    assert_equal(a, c)

    # cleanup, tested on Windows
    try:
        import os

        os.remove(path_str)
        os.remove(path_pathlib)
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    # test with file handle
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0, 0)
    d = load_pickle(fh)
    fh.close()
    assert_equal(a, d)
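
All of these examples exercise the same pair of helpers from statsmodels.iolib.smpickle. As a rough sketch of what they do (an approximation, assuming plain pickle round-tripping; the real helpers route str, pathlib.Path, and open handles through a shared file-opening utility):

import pickle

def save_pickle(obj, fname):
    # Accept an already-open binary handle as well as a str/pathlib path.
    if hasattr(fname, "write"):
        pickle.dump(obj, fname, protocol=-1)
        return
    with open(fname, "wb") as fout:
        pickle.dump(obj, fout, protocol=-1)

def load_pickle(fname):
    if hasattr(fname, "read"):
        return pickle.load(fname)
    with open(fname, "rb") as fin:
        return pickle.load(fin)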
Example #2
def test_pickle():
    import tempfile
    from numpy.testing import assert_equal
    tmpdir = tempfile.mkdtemp(prefix='pickle')
    a = range(10)
    save_pickle(a, tmpdir+'/res.pkl')
    b = load_pickle(tmpdir+'/res.pkl')
    assert_equal(a, b)

    #cleanup, tested on Windows
    try:
        import os
        os.remove(tmpdir+'/res.pkl')
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    #test with file handle
    from statsmodels.compatnp.py3k import BytesIO
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0,0)
    c = load_pickle(fh)
    fh.close()
    assert_equal(a,b)
Example #3
def test_pickle():
    import tempfile
    from numpy.testing import assert_equal
    tmpdir = tempfile.mkdtemp(prefix='pickle')
    a = lrange(10)
    save_pickle(a, tmpdir+'/res.pkl')
    b = load_pickle(tmpdir+'/res.pkl')
    assert_equal(a, b)

    #cleanup, tested on Windows
    try:
        import os
        os.remove(tmpdir+'/res.pkl')
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    # test with file handle
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0, 0)
    c = load_pickle(fh)
    fh.close()
    assert_equal(a, c)
Example #4
def main():
    """
    特征评分卡制作主程序入口
    """
    odds = config.get('SCORECARD', 'ODDS')
    score = config.get('SCORECARD', 'SCORE')
    pdo = config.get('SCORECARD', 'PDO')
    feature_engineering = load_pickle("result/feature_engineering.pickle")
    woe_result = feature_engineering["woe_result"]
    model = load_pickle("result/lr.pickle")
    coefficient = list(
        zip(feature_engineering["feature_selected"], list(model.coef_[0])))
    coefficient.append(("intercept_", model.intercept_[0]))
    coefs = dict(coefficient)
    make_card(coefs, woe_result, odds, score, pdo)
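
make_card is defined elsewhere in the project; a hypothetical sketch of the usual points-to-double-the-odds scaling it presumably applies, assuming woe_result maps each feature to a {bin_label: woe} dict:

from math import log

def make_card(coefs, woe_result, odds, score, pdo):
    b = float(pdo) / log(2)                  # points that double the odds
    a = float(score) - b * log(float(odds))  # base points at the base odds
    n_features = len(coefs) - 1              # coefs also holds "intercept_"
    base = (a - b * coefs["intercept_"]) / n_features
    card = {
        feature: {bin_label: base - b * coefs[feature] * woe
                  for bin_label, woe in bins.items()}
        for feature, bins in woe_result.items()
    }
    return card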
Example #5
def run_forcast(ts, config):
    """This function does the forcasting and return a pandas df with all the historical value
    and predicted one. We have 2 differents mode according to our usage.
    If you need to refit the model, add clean entry to True to the config dict

    Args:
        ts (pandas data frame): ts means time series. There is no a strong 
                                hypthesis on the data frame except an exictence 
                                of a minumum of 2 columns ds (datetime) and y (numeric)
        periods (int):          used to define the future period for forecasting. According 
                                to the freq args, it's interpreted as a daily,monthly or yearly periods
        freq (string):          frequency in the time series. Could take 3 values : 'D' for daily,
                                'M' for monthly, 'Y' for yearly

        config (dictionary):    config is a dictionnary with a minimum of 3 entries. 
                                Could have some model dependant entries or optionnal technical entries
            model (string): the model name
            date (datetime): the date of the training as a time staamp with millis
            id_model (string): model indentifier to make the model unique
            prediction_conf (dictionary):
                future_period (int): number of prediction to perform
                freq (string): 'D' for daily, 'M' for monthly, 'Y' for yearly
            tech_conf (dictionary) [optionnal]:
                clean (bool): clean all existing model and refit. This option remove all exixting model.
                              need improvement to perform an ad hoc clean


    Returns:
       forcasted result (pandas data frame):    This function return some new columns in the data frame.
                                                yhat predicted values, yhat_lower and yhat_upper for uncertainty,
                                                trend, seasonnability...to complete

    Raises:
       XXXXXX

    See usage.py for a demo."""

    # Try to get the model
    model_ref_prefix = tb.compute_model_id_hash(config)

    model_ref_name = tb.get_model_ref_name(model_ref_prefix,
                                           tempfile.gettempdir())
    clean = bool(config.get('tech_conf', {}).get('clean'))

    if model_ref_name is None or clean:
        logger.info(
            "Fitted model doesn't exist or not up to date. We will create one")
        ref_name, m_ref = train_for_forcasting(ts, config)
    else:
        logger.info("Get the stored model")
        m_ref = pick.load_pickle(
            os.path.join(tempfile.gettempdir(), model_ref_name))

    period = config['prediction_conf']['future_period']
    freq = config['prediction_conf']['freq']
    future = tb.make_future(ts, period, freq)
    forcasted_result = dtt.back_to_origin(m_ref.predict(future))

    return forcasted_result
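
A minimal call, with a config shaped the way the docstring describes (the values here are illustrative only):

import pandas as pd

ts = pd.DataFrame({
    "ds": pd.date_range("2020-01-01", periods=90, freq="D"),
    "y": range(90),
})
config = {
    "model": "prophet",                          # illustrative name
    "date": pd.Timestamp.now().value // 10**6,   # training timestamp with millis
    "id_model": "demo-model-001",
    "prediction_conf": {"future_period": 30, "freq": "D"},
    "tech_conf": {"clean": False},               # True forces a refit
}
forecasted = run_forcast(ts, config)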
Example #6
def predicted_sneaker_resale(form_dict):

    repo_path = Path(os.getcwd())
    lm = load_pickle(repo_path / "prod-models" / "resale_predictor.pickle")
    try:
        df = pd.DataFrame(form_dict, index=[0])
        df['retail'] = df['retail'].astype(int)
        df['log_retail'] = np.log(df['retail'])
        df['date'] = pd.to_datetime(df['date'])
        df['release_month'] = df['date'].dt.month
        df['release_dow'] = df['date'].dt.weekday
        month = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'}
        dow = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
        df['release_month'] = df['release_month'].map(month)
        df['release_dow'] = df['release_dow'].map(dow)

        wmns = {'Male':0, 'Female':1}
        df['wmns'] = df['wmns'].map(wmns)

        bools = {'No':0, 'Yes':1}
        df['collab'] = df['collab'].map(bools)
        df['retro'] = df['retro'].map(bools)
        df['kids'] = df['kids'].map(bools)

        sname = form_dict['name']
        pred_resale_price = round(np.exp(lm.predict(df)[0]))
        retail = int(form_dict['retail'])
        diff = round(pred_resale_price - retail)
        pp = round(((pred_resale_price - retail) / pred_resale_price)*100, 2)

        return "The {0} is expected to resell for ${1}! This is a projected {2} dollar difference from the original ${4} retail price and an {3}% price premium.".format(sname, pred_resale_price, diff, pp, retail)

    except Exception:
        return 'Sorry, there was a problem processing the data entered... Please go back and double check your entries, thanks!'
Example #7
def test_pickle_supports_open():
    tmpdir = tempfile.mkdtemp(prefix="pickle")
    a = lrange(10)

    class SubPath:
        def __init__(self, path):
            self._path = pathlib.Path(path)

        def open(
            self,
            mode="r",
            buffering=-1,
            encoding=None,
            errors=None,
            newline=None,
        ):
            return self._path.open(
                mode=mode,
                buffering=buffering,
                encoding=encoding,
                errors=errors,
                newline=newline,
            )

    # test with a pathlib-like object exposing .open()
    path_pathlib = SubPath(os.path.join(tmpdir, "res2.pkl"))
    save_pickle(a, path_pathlib)
    c = load_pickle(path_pathlib)
    assert_equal(a, c)
Example #8
def load_model():
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        "MLR_model_v2.pickle")
    from statsmodels.iolib.smpickle import load_pickle

    model = load_pickle(path)
    return model
Example #9
def processed_feature(mode, read_local=True, do_transform=True, woe=True):
    """
    模型训练主程序入口
    """
    assert os.path.exists('result/feature_engineering.pickle') if do_transform else True

    data = load_data(mode=mode, read_local=read_local)
    numeric_var, category_var, datetime_var, y_var, identifier_var, text_var = get_variable_type()
    feature = data[numeric_var + category_var + datetime_var + text_var]
    y = data[y_var] if mode == "train" else pd.Series([np.nan]*len(feature), name=y_var)

    # Custom data cleaning
    cs = Custom()
    feature, y = cs.clean_data(feature, y)

    # Data type conversion
    feature[numeric_var] = feature[numeric_var].astype('float')
    feature[category_var] = feature[category_var].astype('category')
    feature[datetime_var] = feature[datetime_var].astype('datetime64[ns]')

    # Custom feature combinations; all are numeric variables
    feature = cs.feature_combination(feature, y)

    # Data transformation
    if do_transform:
        feature_engineering = load_pickle('result/feature_engineering.pickle')
        feature = feature[feature_engineering['feature_selected']]
        feature = transform(feature, feature_engineering, woe=woe)
    return feature, y
Example #10
def forecast(dataDir):
    # Load the model
    results_ARIMA = load_pickle(dataDir + 'model/results_ARIMA.pickle')

    # Forecast the next 15 days' prices
    dt = datetime.datetime.now()
    base = datetime.date(dt.year, dt.month, dt.day)
    numdays = 15
    dates = [pd.Timestamp(base + datetime.timedelta(days=x)) for x in range(1, numdays+1)]
    forecast = pd.Series(results_ARIMA.forecast(steps=numdays)[0], dates)
    forecast = np.exp(forecast)
    print(forecast)
    forecast.to_csv(dataDir + 'forecast/forecast.csv',index_label='time',header=['price'])
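
Since dataDir is concatenated directly with the subpaths, the trailing slash matters; a hypothetical call:

forecast('data/')  # reads data/model/results_ARIMA.pickle, writes data/forecast/forecast.csv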
Example #11
def main(mode,
         model_path,
         do_transform,
         read_local,
         woe,
         score=False,
         save_remote=False):
    # Load the data and model
    model, threshold = load_pickle(model_path)

    data = load_data(mode=mode, read_local=read_local)
    feature, y = processed_feature(do_transform=do_transform,
                                   mode=mode,
                                   read_local=read_local,
                                   woe=woe)

    cs = Custom()
    # Apply predictions
    print(">>> Applying predictions")
    res_label = pd.DataFrame(model.predict(feature), columns=['label_predict'])
    res_prob = pd.DataFrame(model.predict_proba(feature),
                            columns=['probability_0', "probability_1"])
    res_prob['res_odds'] = res_prob['probability_0'] / res_prob["probability_1"]
    res_prob['label_threshold'] = res_prob['probability_1'].apply(
        lambda x: 0 if x < threshold else 1)
    res = pd.concat([data, res_label, res_prob], axis=1)

    if score:
        print(">>> 概率转换评分")
        odds = config.get('SCORECARD', 'odds')
        score = config.get('SCORECARD', 'score')
        pdo = config.get('SCORECARD', 'pdo')
        a, b = make_score(odds, score, pdo)
        res['score'] = res_prob['res_odds'].apply(
            lambda x: a + b * log(float(x)))
        bins = (tree_binning(res[y.name], res['score'].to_frame())[0]["result"]["score"]
                if mode == "train" else cs.adjust_bins)
        if bins:
            print(">>> Binning the dataset")
            res['level'] = pd.cut(res['score'], bins)
            temp = res.groupby("level", as_index=False).count()
            temp['rate'] = temp['label_threshold'] / feature.shape[0]
            temp = temp[['level', 'rate']]
            print(temp)
            print(res.head())

    # Save the results
    print(f">>> Saving results, remote save mode: {save_remote}")
    res['load_date'] = str(date.today())
    save_result(res, filename=f"{mode}_result.csv", remote=save_remote)
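
make_score is imported from elsewhere; a minimal sketch consistent with the score = a + b * log(odds) usage above, assuming the standard points-to-double-the-odds parameterization:

from math import log

def make_score(odds, score, pdo):
    # b: points that double the good/bad odds; a anchors `score` at the base `odds`
    b = float(pdo) / log(2)
    a = float(score) - b * log(float(odds))
    return a, b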
Example #12
def test_pickle():
    tmpdir = tempfile.mkdtemp(prefix='pickle')
    a = lrange(10)
    save_pickle(a, tmpdir + '/res.pkl')
    b = load_pickle(tmpdir + '/res.pkl')
    assert_equal(a, b)

    # cleanup, tested on Windows
    try:
        import os
        os.remove(tmpdir + '/res.pkl')
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    # test with file handle
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0, 0)
    c = load_pickle(fh)
    fh.close()
    assert_equal(a, c)
Example #13
    def load(cls, fname):
        '''
        Load a pickle (class method).

        Parameters
        ----------
        fname : string or file handle
            fname can be a string with a file path or filename, or a file handle.

        Returns
        -------
        unpickled instance
        '''

        from statsmodels.iolib.smpickle import load_pickle
        return load_pickle(fname)
Example #14
def get_predictions(station_number, number_of_predictions):
    base_dir = os.getcwd() + '/data/'
    try:
        model = load_pickle(base_dir + 'models/' + str(station_number) +
                            '.pkl')
    except FileNotFoundError as e:
        print("Model not found. Please run this script atleast once.")
        return None
    yhat = model.forecast(model.y, steps=number_of_predictions)
    max_vals = []
    columns = []
    try:
        max_val_file_content = list(
            csv.reader(
                open(base_dir + "csv/max_val_dump(do not delete).csv", 'r')))
    except FileNotFoundError as e:
        print(
            "CSV file for maximum values dumped not found. Please run the script to create it."
        )
        return None
    try:
        columns_file_content = list(
            csv.reader(open(base_dir + "csv/cols_dump(do not delete).csv",
                            'r')))
    except FileNotFoundError as e:
        print(
            "CSV file for columns dumped not found. Please run the script to create it."
        )
        return None
    for line in max_val_file_content:
        if str(line[0]) == str(station_number)[:-1]:
            max_vals = list(map(float, line[1:]))
            break
    yhat = yhat * [max_vals]
    for line in columns_file_content:
        if str(line[0]) == str(station_number)[:-1]:
            columns = line[1:]
            break
    cols_to_return = {}
    for key in COLUMNS:
        if key in columns:
            cols_to_return[key] = []
            index = columns.index(key)
            for row in yhat:
                cols_to_return[key].append('{:.2f}'.format(row[index]))
        else:
            cols_to_return[key] = None
    return cols_to_return
Example #15
    def load(cls, fname):
        """
        Load a pickled results instance.

        Parameters
        ----------
        fname : {str, handle}
            A string filename or a file handle.

        Returns
        -------
        Results
            The unpickled results instance.
        """
        from statsmodels.iolib.smpickle import load_pickle
        return load_pickle(fname)
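
Usage with statsmodels, for example (OLS here is just for illustration; any results class with save() works the same way):

import numpy as np
import statsmodels.api as sm

x = sm.add_constant(np.arange(10.0))
res = sm.OLS(2.0 * np.arange(10.0) + 1.0, x).fit()
res.save("ols_results.pkl")  # pickle the fitted results
same = sm.regression.linear_model.OLSResults.load("ols_results.pkl")
print(same.params)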
Example #16
def get_prediction(num_arete, hour, minute=0):
    m, s = False, False
    if (hour <= 9 and hour >= 7):
        m = True
        hour = hour - 7
    elif (hour <= 19 and hour >= 17):
        s = True
        hour = hour - 17
    creneau = 0 if m else 1
    res = load_pickle("../data/Regression/vehicules_aretes_" + str(creneau) +
                      "_n" + str(num_arete) + ".pickle")

    #print(res.summary())
    #print(res.params)

    # params[0] is the intercept and params[1] the per-minute slope, so the
    # prediction can be computed directly from the fitted parameters
    y_pred = res.params[0] + res.params[1] * (hour * 60 + minute)

    return y_pred
Example #17
def predict_margins(nfl):

    margin_res = load_pickle("models/margin_res.pickle")

    margin_ari_score = 0
    margin_ari_opp = 0
    margins = []

    for key, row in nfl.iterrows():
        if row.team == "ARI":
            team_coeff = margin_ari_score
        else:
            res_team = "team[T." + row.team + "]"
            team_coeff = margin_res.params[res_team]
        if row.opp == "ARI":
            opp_coeff = margin_ari_opp
        else:
            res_opp = "team[T." + row.opp + "]"
            opp_coeff = margin_res.params[res_opp]
        if row.ha == "away":
            ha_coeff = margin_res.params["ha[T.home]"]*-1
        else:
            ha_coeff = margin_res.params["ha[T.home]"]*1

        stat_cols = ['third_per', 'third_per_allowed', 'TOP', 'first_downs',
                     'first_downs_allowed', 'pass_yards', 'pass_yards_allowed',
                     'penalty_yards', 'plays', 'rush_yards',
                     'rush_yards_allowed', 'sacked', 'sacks', 'takeaways',
                     'total_yards', 'total_yards_allowed', 'turnovers']
        margin_predict = (margin_res.params.Intercept
                          + sum(margin_res.params[col] * row[col]
                                for col in stat_cols)
                          + ha_coeff + team_coeff + opp_coeff)

        margins.append(margin_predict)

    away_margin = margins[0] + margins[1]
    home_margin = -1*away_margin
    pred_margins = [home_margin, away_margin]

    return pred_margins
Example #18
def predict_totals(nfl):

    total_res = load_pickle("models/total_res.pickle")

    total_ari_score = 0
    total_ari_opp = 0
    totals = []

    for key, row in nfl.iterrows():
        if row.team == "ARI":
            team_coeff = total_ari_score
        else:
            res_team = "team[T." + row.team + "]"
            team_coeff = total_res.params[res_team]
        if row.opp == "ARI":
            opp_coeff = total_ari_opp
        else:
            res_opp = "team[T." + row.opp + "]"
            opp_coeff = total_res.params[res_opp]
        if row.ha == "away":
            ha_coeff = total_res.params["ha[T.home]"]*-1
        else:
            ha_coeff = total_res.params["ha[T.home]"]*1

        stat_cols = ['third_per', 'third_per_allowed', 'TOP', 'first_downs',
                     'first_downs_allowed', 'pass_yards', 'pass_yards_allowed',
                     'penalty_yards', 'plays', 'rush_yards',
                     'rush_yards_allowed', 'sacked', 'sacks', 'takeaways',
                     'total_yards', 'total_yards_allowed', 'turnovers']
        total_predict = (total_res.params.Intercept
                         + sum(total_res.params[col] * row[col]
                               for col in stat_cols)
                         + ha_coeff + team_coeff + opp_coeff)

        totals.append(total_predict)

    total_predicted = (totals[0] + totals[1])/2
    pred_totals = [total_predicted, total_predicted]

    return totals, pred_totals
Example #19
    def load(cls, fname):
        """
        Load a pickled results instance

        .. warning::

           Loading pickled models is not secure against erroneous or
           maliciously constructed data. Never unpickle data received from
           an untrusted or unauthenticated source.

        Parameters
        ----------
        fname : {str, handle}
            A string filename or a file handle.

        Returns
        -------
        Results
            The unpickled results instance.
        """
        from statsmodels.iolib.smpickle import load_pickle
        return load_pickle(fname)
Example #20
def load(dataset, mode_type='regression'):
    if mode_type not in ['knn', 'ada']:
        fname = dataset + '.pickle'
        from statsmodels.iolib.smpickle import load_pickle
        if os.path.isfile(fname):
            return load_pickle(fname)
        else:
            print("Pickled file " + fname + " not found!")
            print("Constructing model...")
            model = construct(dataset)
            print("Serializing...")
            model.save(fname)
            return model
    else:
        fname = dataset + '_{0}.pickle'.format(mode_type)
    if os.path.isfile(fname):
        return joblib.load(fname)
    else:
        print("Pickled file " + fname + " not found!")
        print("Constructing model...")
        model = scikit_construct(dataset, mode_type)
        print("Serializing...")
        joblib.dump(model, fname)
        return model
Example #21
    def load(cls, fname):
        from statsmodels.iolib.smpickle import load_pickle
        return load_pickle(fname)
Example #22
    def load(cls, fname):
        from statsmodels.iolib.smpickle import load_pickle
        return load_pickle(fname)
Example #23
# If the specified range extends into the future, a rolling forecast is performed
pred = results.predict(start='2015-02-01', end='2015-02-10')
print(pred)

# 2) Rolling forecast: predict the next several values
pred = results.forecast(5)
print(pred)
# Because this is a rolling forecast, accuracy degrades the further out you go

# 3) Save the model
fname = 'out.pkl'
results.save(fname)

# 4) Load the model
from statsmodels.iolib.smpickle import load_pickle
results = load_pickle(fname)

# 5) Apply the model
print(results.params)
pred = results.forecast(3)
print(pred)

######################################################################
########  Part 2: Double Exponential Smoothing
######################################################################
# Brown's Linear Exponential Smoothing


def double_exponential_smoothing(ts, alpha=0.8, isPlot=True):
    """
    布朗线性趋势模型(二次指数平滑)
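
The body of this snippet is cut off; for reference, a textbook Brown recursion under the same signature might look like this (a generic sketch, not the original body; the isPlot flag is ignored):

def double_exponential_smoothing(ts, alpha=0.8):
    # Generic Brown's linear (double) exponential smoothing sketch.
    s1 = s2 = ts[0]
    result = [ts[0]]
    for t in range(1, len(ts)):
        s1 = alpha * ts[t] + (1 - alpha) * s1   # first smoothing pass
        s2 = alpha * s1 + (1 - alpha) * s2      # second smoothing pass
        level = 2 * s1 - s2
        trend = alpha / (1 - alpha) * (s1 - s2)
        result.append(level + trend)            # one-step-ahead forecast
    return result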
Example #24
    def __init__(self):
        self.model_filename = os.path.join(MODELS_BASE_DIR,
                                           'final_var_model.pkl')
        self.model = load_pickle(self.model_filename)
        LOGGER.info('VAR Model: ' + os.path.basename(self.model_filename) +
                    ' is loaded')
Example #25
File: app.py Project: yvak90/WC_AT
import numpy as np
import pandas as pd
from flask import Flask, request, jsonify, render_template
import pickle

app = Flask(__name__)
from statsmodels.iolib.smpickle import load_pickle
model = load_pickle("slr_wcat.pkl")


@app.route('/')
def home():
    return render_template('startup.html')


@app.route('/predict', methods=['POST'])
def predict():
    '''
    For rendering results on HTML GUI
    '''
    float_features = [float(x) for x in request.form.values()]
    waist = float_features[0]
    waist_sq = waist * waist
    waist_cb = waist * waist * waist
    wcat = pd.DataFrame([[waist, waist_sq, waist_cb]],
                        columns=["Waist", "Waist_sq", "Waist_cb"])
    x = np.exp(model.predict(wcat))
    print(round(float(x[0]), 2))
    #    flt_features = [float(x) for x in request.form.values()]
    ##    int_features = [int(x) for x in request.form.values()]
    #    final_features = [np.array(int_features)]
Example #26
                        help='Directory of Pandas DataFrame')
    parser.add_argument('--model',
                        type=str,
                        default='./models/baseline_regression_linear.pickle',
                        help='Directory of Model')

    FLAGS, unparsed = parser.parse_known_args()

    # Loading the data
    data = pd.read_csv(FLAGS.data)

    # Preprocess the data
    data = preprocess_data(data)

    # Load model
    model = load_pickle(FLAGS.model)

    # Predictions
    predictions = model.get_prediction(exog=data)
    data['final_grade_predicted'] = predictions.predicted_mean
    print('\nData:\n')
    print(data)
    print()
    print(
        f"Average increase: "
        f"{100*np.mean((data['final_grade_predicted']/data['final_grade'])-1):.3f}%"
    )
    print()

    # Metrics
    print('\nEvaluation Metrics:')
Example #27
def import_models(i):
    model = load_pickle(r"{}\models\model{}.pickle".format(path, i))

    return model
Example #28
import copy

import pandas as pd
from statsmodels.iolib.smpickle import load_pickle

data = pd.read_csv("../data/lr-binary.csv")
combos = copy.deepcopy(data)

print(combos.head())
# The columns in the data must match the columns used at prediction time
predict_cols = combos.columns[1:]

# The prediction set must also include the intercept variable
combos['intercept'] = 1.0

model = load_pickle("lr_admit.model")
print(model.summary())
# Run the prediction and store the predicted scores in the predict column
combos['predict'] = model.predict(combos[predict_cols])
print(combos['predict'])

# After prediction, the predict values are probabilities in [0, 1]
# We can extract the prediction results as needed
# For example, assume predict > 0.5 means the applicant is admitted
# Here we check the accuracy of that selection rule
total = 0
hit = 0
for value in combos.values:
    # The predicted score, predict, is the last column of the data
    predict = value[-1]
    # The actual admission result
    admit = int(value[0])
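    # (The snippet breaks off here; a hypothetical completion consistent with
    # the total/hit counters above:)
    total += 1
    if (predict > 0.5) == (admit == 1):
        hit += 1

print('accuracy: %.2f%%' % (100.0 * hit / total))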
Example #29
        for path in [cell_paths_sorted[0]]:  # Only need the first cell, which is the Ellsworth mouth/outlet
            cell_result = pd.read_csv(results_dir / path)
            jday_pad = cell_result['Jday'].apply(lambda x: str(x).zfill(3))
            str_year = cell_result['Year'].apply(lambda x: str(x))
            cell_result['date'] = str_year + jday_pad
            rng = pd.to_datetime(cell_result['date'], format='%Y%j')
            cell_result.index = rng
            cell_results_scenario.append(cell_result)

    cell_results.append(cell_results_scenario)

# =======================================================================
# Correct VELMA stream temperature seasonal bias using pre-trained regression model
# *** Not sure if this correction is still valid considering the non-linear seasonal changes of the climate projections ***

olsmodel = load_pickle(config.data_path.parents[0] / 'models' / 'stream_temp_correction_ols.pickle')

stream_temps_corrected = []
for i, scenario in enumerate(scenarios):
    stream_temps_scenario = []
    for j, gcm in enumerate(gcms):
        z = cell_results[i][j]['Water_Surface_Temperature(degrees_C)']

        day = 24 * 60 * 60
        year = 365.2425 * day
        timestamp_secs = pd.to_datetime(z.index)
        timestamp_secs = timestamp_secs.map(datetime.datetime.timestamp)
        year_cos = np.cos(timestamp_secs * (2 * np.pi / year))
        year_sin = np.sin(timestamp_secs * (2 * np.pi / year))

        y = pd.DataFrame(data=np.column_stack([z, year_cos, year_sin]), columns=['temp', 'year_cos', 'year_sin'])
Example #30
    def run_models():
        #-------------------------------------Creating and storing MLP model-----------------------------------------------------
        # Importing the dataset and separating dependent/independent variables

        dataset = pd.read_csv("assets/predicts.csv")

        print(dataset.dtypes)

        dataset['Main purpose of visit'].value_counts()
        dataset['Accessibility status'].value_counts()
        dataset['Accomodation status'].value_counts()
        dataset['health services status'].value_counts()

        cleanup_nums = {
            "Accessibility status": {
                "Poor": 1,
                "Fair": 2,
                "Good": 3,
                "Better": 4
            },
            "Accomodation status": {
                "Poor": 1,
                "Fair": 2,
                "Good": 3,
                "Better": 4
            },
            "health services status": {
                "Poor": 1,
                "Fair": 2,
                "Good": 3,
                "Better": 4
            },
        }
        dataset.replace(cleanup_nums, inplace=True)

        print(dataset.head(5))
        X = dataset.iloc[:, 1:8].values
        print(X[:, 3])

        y = dataset.iloc[:, 10].values
        print(y)
        # Encoding categorical data

        labelencoder_X_3 = LabelEncoder()
        X[:, 3] = labelencoder_X_3.fit_transform(X[:, 3])

        list(labelencoder_X_3.inverse_transform([0, 1, 2, 3]))

        print(X)

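        # NOTE (added): OneHotEncoder's categorical_features argument below was
        # removed in scikit-learn 0.22; newer code would select the column with
        # sklearn.compose.ColumnTransformer instead.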
        onehotencoder = OneHotEncoder(categorical_features=[3])
        X = onehotencoder.fit_transform(X).toarray()

        X = X[:, 1:]

        print('\n'.join(
            [''.join(['{:9}'.format(item) for item in row]) for row in X]))

        # Splitting the dataset into the Training set and Test set
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)

        a = y_test
        b = y_train

        # Feature scaling

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        # Part 2 - making the ANN model

        # Importing the Keras libraries and packages

        # Initialising the ANN for regression

        #Creating regression model
        REG = Sequential()

        # Adding the input layer and the first hidden layer with dropout if required
        REG.add(
            Dense(units=20,
                  input_dim=9,
                  kernel_initializer="normal",
                  activation='relu'))
        #REG.add(Dropout(p=0.1))
        # Adding the second hidden layer
        REG.add(Dense(units=20, kernel_initializer="normal",
                      activation='relu'))
        #REG.add(Dropout(p=0.1))
        # Adding the output layer
        REG.add(Dense(units=1, kernel_initializer="normal"))

        # Compiling the ANN
        #def root_mean_squared_error(y_true, y_pred):
        #        return K.sqrt(K.mean(K.square(y_pred - y_true)))

        REG.compile(optimizer='adam', loss='mean_squared_error')

        # Fitting the ANN to the Training set
        REG.fit(X_train, y_train, batch_size=10, epochs=200)

        # Part 3 - Making the predictions and evaluating the model

        # Predicting the Test set results
        y_pred = REG.predict(X_test)

        REG.save('assets/REG_MLP_model.h5')
        K.clear_session()
        #---------------------------------------------------------------------------------------------------------------------

        #---------------------------------------Creating and storing SARIMA model----------------------------------------------
        #data collection... converting the dataset to HTML
        df = pd.read_csv('assets/Touristarrival_monthly.csv')
        df1 = df.iloc[:5]
        html_table_template = df1.to_html(index=False)
        html_table = df.to_html(index=False)
        #data observation and log transformation
        df.index = pd.to_datetime(df['Month'])
        df['#Tourists'].plot()
        mpl.pyplot.ylabel("No.of Toursits Arrivals ")
        mpl.pyplot.xlabel("Year")

        #storing plots
        mpl.pyplot.savefig('PredictionEngine/static/img/sarima_input.png',
                           dpi=600,
                           bbox_inches='tight')
        mpl.pyplot.clf()

        series = df['#Tourists']
        logtransformed = np.log(series)
        logtransformed.plot()
        mpl.pyplot.ylabel("log Scale(No.of Toursits Arrivals) ")
        mpl.pyplot.xlabel("Year")

        #storing plots
        mpl.pyplot.savefig(
            'PredictionEngine/static/img/sarima_input_logscaled.png',
            dpi=600,
            bbox_inches='tight')
        mpl.pyplot.clf()

        #Train test split
        percent_training = 0.80
        split_point = round(len(series) * percent_training)
        print(split_point)
        training, testing = series[0:split_point], series[split_point:]
        training = np.log(training)

        #differencing to achieve stationarity
        training_diff = training.diff(periods=1).values[1:]

        #plot of residual log differenced series
        mpl.pyplot.plot(training_diff)
        mpl.pyplot.title("Tourist arrivals data log-differenced")
        mpl.pyplot.xlabel("Years")
        mpl.pyplot.ylabel("Toursits arrivals")
        mpl.pyplot.clf()

        #ACF and PACF plots 1(with log differenced training data)
        lag_acf = acf(training_diff, nlags=40)
        lag_pacf = pacf(training_diff, nlags=40, method='ols')

        #plot ACF
        mpl.pyplot.figure(figsize=(15, 5))
        mpl.pyplot.subplot(121)
        mpl.pyplot.stem(lag_acf)
        mpl.pyplot.axhline(y=0, linestyle='-', color='black')
        mpl.pyplot.axhline(y=-1.96 / np.sqrt(len(training)),
                           linestyle='--',
                           color='gray')
        mpl.pyplot.axhline(y=1.96 / np.sqrt(len(training)),
                           linestyle='--',
                           color='gray')
        mpl.pyplot.xlabel('lag')
        mpl.pyplot.ylabel("ACF")
        #storing plots in bytes
        mpl.pyplot.savefig('PredictionEngine/static/img/sarima_afc.png',
                           dpi=600,
                           bbox_inches='tight')
        mpl.pyplot.clf()

        #plot PACF
        mpl.pyplot.figure(figsize=(15, 5))
        mpl.pyplot.subplot(122)
        mpl.pyplot.stem(lag_pacf)
        mpl.pyplot.axhline(y=0, linestyle='-', color='black')
        mpl.pyplot.axhline(y=-1.96 / np.sqrt(len(training)),
                           linestyle='--',
                           color='gray')
        mpl.pyplot.axhline(y=1.96 / np.sqrt(len(training)),
                           linestyle='--',
                           color='gray')
        mpl.pyplot.xlabel('lag')
        mpl.pyplot.ylabel("PACF")
        #storing plots in bytes
        mpl.pyplot.savefig('PredictionEngine/static/img/sarima_pafc.png',
                           dpi=600,
                           bbox_inches='tight')
        mpl.pyplot.clf()

        #SARIMA Model specification
        model = sm.tsa.statespace.SARIMAX(training,
                                          order=(2, 0, 3),
                                          seasonal_order=(2, 1, 0, 12),
                                          trend='c',
                                          enforce_invertibility=False,
                                          enforce_stationarity=False)

        # fit model
        model_fit = model.fit()

        model_fit.save("assets/REG_SARIMA_model.pickle")

        print(model_fit.summary())

        #plot residual errors
        # residuals = pd.DataFrame(model_fit.resid)
        # fig, ax = mpl.pyplot.subplots(1,2)
        # residuals.plot(title="Residuals", ax=ax[0])
        # residuals.plot(kind='kde', title='Density', ax=ax[1])
        # mpl.pyplot.show()
        # print(residuals.describe())

        # Model evaluation and forecast
        model_fitted = load_pickle("assets/REG_SARIMA_model.pickle")
        forecast = model_fitted.forecast(len(df) - 250)
        print(forecast)
        forecast = np.exp(forecast)
        print(forecast)
        #plot forecast results and display RMSE
        mpl.pyplot.figure(figsize=(10, 5))
        mpl.pyplot.plot(forecast, 'r')
        mpl.pyplot.plot(series, 'b')
        mpl.pyplot.legend(['Predicted test values', 'Actual data values'])

        mpl.pyplot.title('RMSE:%.2f' %
                         np.sqrt(sum((forecast - testing)**2) / len(testing)))
        mpl.pyplot.ylabel("No.of Toursits Arrivals Monthly")
        mpl.pyplot.xlabel("Year")
        mpl.pyplot.autoscale(enable=True, axis='x', tight=True)
        mpl.pyplot.axvline(x=series.index[split_point], color='black')
        #storing plots
        mpl.pyplot.savefig('PredictionEngine/static/img/sarima_result.png',
                           dpi=600,
                           bbox_inches='tight')
        mpl.pyplot.clf()

        forecaste = model_fitted.forecast(len(df) - 214)
        forecast_next = forecaste[62:]
        forecast_next = np.exp(forecast_next)
        print(forecast_next)
        mpl.pyplot.figure(figsize=(10, 5))
        mpl.pyplot.plot(forecast_next, 'r')
        mpl.pyplot.plot(series, 'b')
        mpl.pyplot.legend(['Predicted next steps values'])
        mpl.pyplot.title('Monthly tourist arrivals predictions')
        mpl.pyplot.ylabel("No.of Toursits Arrivals ")
        mpl.pyplot.xlabel("Year")
        mpl.pyplot.autoscale(enable=True, axis='x', tight=True)

        #storing plots in bytes
        mpl.pyplot.savefig('PredictionEngine/static/img/sarima_forecast.png',
                           dpi=600,
                           bbox_inches='tight')
        mpl.pyplot.clf()
Example #31
import warnings
import itertools
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.api import VAR #vector auto reg
from statsmodels.iolib.smpickle import load_pickle

import dataProcessing as dp # for get_capacities, get stations

y_df = pd.read_csv("Velib/dataframe_streaming_1.csv") #Velib/ #??model_fit = VAR.load('ar_model.pkl')

model_fit = load_pickle('ar_model.pkl')

def predict_nextstate(y_df, model_fit):

    df_s_c = pd.read_csv("stations_capacities.csv") #Velib/ TODO once in github
    stations = dp.get_stations(df_s_c)
    capacities = dp.get_capacities(df_s_c)


    #remove dates
    y = y_df.drop(columns=['Date']).to_numpy()
    # y[i][0] is the station and y[i][1] is the number of bikes

    # Build a vector .. [Y]
    # number of recorded time steps
    times = len(y)/len(stations)
Example #32
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

warnings.filterwarnings("ignore")
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.api import VAR  # vector auto reg
from statsmodels.iolib.smpickle import load_pickle
import time
import dataProcessing as dp  # for get_capacities, get stations

y_df = pd.read_csv("dataframe_streaming.csv"
                   )  # Velib/ #??model_fit = VAR.load('ar_model.pkl')

model_fit = load_pickle('../model/ar_model.pkl')


def predict_nextstate(y_df, model_fit):
    df_s_c = pd.read_csv("../training_csv/stations_capacities.csv"
                         )  # Velib/ TODO once in github
    stations = dp.get_stations()
    capacities = dp.get_capacities(df_s_c)

    # remove dates
    y = y_df.drop(columns=['Date']).to_numpy()

    # Build a vector .. [Y]
    # number of recorded time steps
    times = len(y) / len(stations)
    print("nombre de temps dans les données de test: ", times)