Python remove_outliers示例

编程语言: Python

命名空间/包名称: utils

方法/功能: remove_outliers

hotexamples.com的示例: 8

Python remove_outliers - 已找到8个示例。这些是从开源项目中提取的最受好评的utils.remove_outliers现实Python示例。您可以评价示例，以帮助我们提高示例质量。

示例#1

显示文件

文件： find_submission_models.py 项目： edgBR/time_series

def compute_and_plot_submission_with_arima(save_path, fig_folder):
    """Compare ARIMA and the candidate models per series and build a submission.

    The function reads ``data/train.csv`` (indexed by ``Day``), splits it into
    a training and a validation interval, and then, for every series column:

    1. fits ARIMA (``arima_forecast`` with the orders stored in
       ``ARIMA_parameters``, or ``auto_arima_forecast`` when the series number
       is >= 100) and every model listed in ``models_to_try`` on the training
       interval, scoring each one by the SMAPE it returns on the validation
       interval;
    2. keeps the model with the lowest validation SMAPE;
    3. refits every model on the full training interval, forecasts the test
       interval, and stores the forecast of the chosen model in the
       submission frame.

    Side effects: shows and saves two comparison figures per series, writes
    one raw and one ``keyvalue``-formatted CSV per model and series, writes
    the summary CSVs under ``data/``, and prints a report to stdout.

    :param save_path: destination passed to ``to_csv`` for the formatted
        submission.
    :param fig_folder: prefix that is string-concatenated in front of every
        figure and per-model CSV file name (no separator is added here).
    :return: None
    """
    # with this dataset, the index column is day
    data = pd.read_csv("data/train.csv", index_col="Day")

    # changing index to datetime object year - month - day
    data.index = pd.to_datetime(data.index, format="%Y-%m-%d")
    data = data.asfreq('d')

    nbr_series = len(data.columns)
    # NOTE: nbr_samples is computed but not read anywhere below.
    nbr_samples = data["series-1"].count()

    start_date = data.index[0]
    # NOTE(review): hard-coded position; presumably the last day of the
    # training file (the day before the test interval) -- confirm.
    end_date = data.index[781]

    print("Start date " + str(start_date))
    print("End date " + str(end_date))

    interval_train = pd.date_range(start=start_date, end='2017-07-30')

    # training set without validation for the submission
    interval_train_full = pd.date_range(start=start_date, end=end_date)

    # validation is 21 days
    interval_valid = pd.date_range(start='2017-07-31', end=end_date)

    # test is 21 days
    interval_test = pd.date_range(start='2017-08-21', end='2017-09-10')

    # number of samples we are predicting
    horizon = len(interval_test)

    # separating data into train and validation set
    data_train = data.loc[interval_train]
    data_train_full = data.loc[interval_train_full]
    data_valid = data.loc[interval_valid]
    # the test/submission frames start empty: only the test index is known
    data_test = pd.DataFrame(index=interval_test)
    data_submit = pd.DataFrame(index=interval_test)
    data_best_arima = pd.DataFrame(index=interval_test)

    # for plotting
    # one color per candidate model, indexed by the model's position in the
    # per-series model list (so at most len(methods_colors) models per series)
    methods_colors = [
        "blue", "red", "cyan", "orange", "pink", "magenta", "grey", "yellow"
    ]

    # to store the chosen model
    chosen_method = []
    chosen_method_smape = []
    chosen_method_param = []

    # to record the smape of each model
    # record_smapes[i - 1][0] is ARIMA, the following entries are the
    # candidate models in list order
    record_smapes = []

    # length of the training tail shown on the "last N days" plots
    number_of_days = 62

    ARIMA_ORDER_VALID = []
    ARIMA_ORDER_TEST = []

    # series columns are named "series-1" ... "series-<nbr_series>"
    for i in range(1, nbr_series + 1):

        # to perform comparisons on models
        best_smape = 1000000000
        best_model = None
        best_model_param = None

        # models_to_try / models_to_try_parameters are defined outside this
        # function; entry i - 1 holds the callables / parameters for series i
        nn_models_current_series = models_to_try[i - 1]
        nn_models_params_current_series = models_to_try_parameters[i - 1]
        nbr_models_current_series = len(nn_models_current_series)
        record_smapes.append([])

        # for validation, we plot each validation forecast separately
        # axis 0 is a comparison of each forecast on the last number_of_days days
        # axis 1 is a comparison of each forecast on the whole series
        # axis 2 shows the ARIMA forecast alone, axes 3+ one candidate model each
        _, ax = plt.subplots(nrows=nbr_models_current_series + 3,
                             ncols=1,
                             figsize=(12, 10))

        ax[0].set_title(
            'Comparison of model forecast on validation series for series ' +
            str(i) + ' (last ' + str(number_of_days) + ' days)')
        ax[0].plot(data_train['series-' + str(i)][-number_of_days:],
                   color="black",
                   linestyle="-")
        ax[0].plot(data_valid['series-' + str(i)],
                   color="green",
                   linestyle="-")

        ax[1].set_title(
            'Comparison of model forecast on validation series for series ' +
            str(i) + ' (whole series without outliers)')
        ax[1].plot(remove_outliers(data_train['series-' + str(i)]),
                   color="black",
                   linestyle="-")
        ax[1].plot(data_valid['series-' + str(i)],
                   color="green",
                   linestyle="-")

        # legend labels are listed in the order the lines are drawn below:
        # train, validation, ARIMA, then every candidate model
        axis_legend = [
            "Train series", "Validation series", auto_arima_forecast.__name__
        ]
        legend_size = 6
        for model_name in nn_models_current_series:
            axis_legend.append(model_name.__name__)
        ax[0].legend(axis_legend, prop={'size': legend_size})
        ax[1].legend(axis_legend, prop={'size': legend_size})
        # reset to the two base labels; per-axis legends extend a copy of it
        axis_legend = ["Train series", "Validation series"]

        ax[2].set_title(
            'Comparison of model forecast on validation series for series ' +
            str(i) + ' (last ' + str(number_of_days) + ' days)')
        ax[2].plot(data_train['series-' + str(i)][-number_of_days:],
                   color="black",
                   linestyle="-")
        ax[2].plot(data_valid['series-' + str(i)],
                   color="green",
                   linestyle="-")

        # series numbered >= 100 search for the ARIMA orders automatically;
        # the others reuse the orders stored in ARIMA_parameters
        if i >= 100:
            smape, forecast, order, seasonal_order = auto_arima_forecast(
                data_train['series-' + str(i)],
                data_valid['series-' + str(i)],
                horizon,
                del_outliers=True,
                normalize=True,
                plot=False)
        else:
            order = ARIMA_parameters[i - 1][0]
            seasonal_order = ARIMA_parameters[i - 1][1]
            smape, forecast = arima_forecast(data_train['series-' + str(i)],
                                             data_valid['series-' + str(i)],
                                             horizon,
                                             order,
                                             seasonal_order,
                                             del_outliers=True,
                                             normalize=True,
                                             plot=False)
        ARIMA_ORDER_VALID.append((order, seasonal_order))
        ax[0].plot(forecast, color="steelblue", linestyle="--")
        ax[1].plot(forecast, color="steelblue", linestyle="--")

        ax[2].plot(forecast, color="steelblue", linestyle="--")
        axis_legend_copy = copy.copy(axis_legend)
        axis_legend_copy.append(auto_arima_forecast.__name__ + " " +
                                str(order) + " " + str(seasonal_order) + " " +
                                str(str("{:.2f}".format(smape))))
        ax[2].legend(axis_legend_copy, prop={'size': legend_size})

        record_smapes[i - 1].append(smape)

        # ARIMA is always recorded under auto_arima_forecast, whichever of the
        # two ARIMA functions actually produced the forecast
        if smape < best_smape:
            best_smape = smape
            best_model = auto_arima_forecast
            best_model_param = (order, seasonal_order)

        # validation pass over the candidate models of this series
        for model_index in range(nbr_models_current_series):
            ax[model_index + 3].set_title(
                'Comparison of model forecast on validation series for series '
                + str(i) + ' (last ' + str(number_of_days) + ' days)')
            ax[model_index + 3].plot(data_train['series-' +
                                                str(i)][-number_of_days:],
                                     color="black",
                                     linestyle="-")
            ax[model_index + 3].plot(data_valid['series-' + str(i)],
                                     color="green",
                                     linestyle="-")

            smape, forecast = nn_models_current_series[model_index](
                data_train['series-' + str(i)],
                data_valid['series-' + str(i)],
                nn_models_params_current_series[model_index],
                horizon,
                del_outliers=True,
                normalize=True,
                plot=False)

            ax[0].plot(forecast,
                       color=methods_colors[model_index],
                       linestyle="--")
            ax[1].plot(forecast,
                       color=methods_colors[model_index],
                       linestyle="--")

            ax[model_index + 3].plot(forecast,
                                     color=methods_colors[model_index],
                                     linestyle="--")
            axis_legend_copy = copy.copy(axis_legend)
            axis_legend_copy.append(
                nn_models_current_series[model_index].__name__ + " " +
                str(str("{:.2f}".format(smape))))
            ax[model_index + 3].legend(axis_legend_copy,
                                       prop={'size': legend_size})

            record_smapes[i - 1].append(smape)

            # strict "<" keeps the earliest model when SMAPEs tie
            if smape < best_smape:
                best_smape = smape
                best_model = nn_models_current_series[model_index]
                best_model_param = nn_models_params_current_series[model_index]

        print("--- SERIES " + str(i) + " SMAPES ---")

        print("   MODEL " + auto_arima_forecast.__name__ + " PARAM " +
              str(ARIMA_ORDER_VALID[i - 1]) + " SMAPE " +
              str(record_smapes[i - 1][0]))
        for model_index in range(nbr_models_current_series):
            print("   MODEL " +
                  nn_models_current_series[model_index].__name__ + " PARAM " +
                  str(nn_models_params_current_series[model_index]) +
                  " SMAPE " + str(record_smapes[i - 1][model_index + 1]))

        plt.tight_layout()
        plt.savefig(fig_folder + 'series-' + str(i) + "-comparison.pdf")
        plt.show()
        """ --------------- now to chosen the best model -----------------------"""

        # second figure: same layout, but the models are refit on the full
        # training interval and forecast the test interval
        _, ax = plt.subplots(nrows=nbr_models_current_series + 3,
                             ncols=1,
                             figsize=(12, 10))

        ax[0].set_title(
            'Comparison of model forecast on test series for series ' +
            str(i) + ' (last ' + str(number_of_days) + ' days)')
        ax[0].plot(data_train['series-' + str(i)][-number_of_days:],
                   color="black",
                   linestyle="-")
        ax[0].plot(data_valid['series-' + str(i)],
                   color="green",
                   linestyle="-")

        ax[1].set_title(
            'Comparison of model forecast on test series for series ' +
            str(i) + ' (whole series without outliers)')
        ax[1].plot(remove_outliers(data_train['series-' + str(i)]),
                   color="black",
                   linestyle="-")
        ax[1].plot(data_valid['series-' + str(i)],
                   color="green",
                   linestyle="-")

        # axis_legend_all grows as forecasts are drawn and is applied to
        # axes 0 and 1 at the end of the iteration
        axis_legend_all = ["Train series", "Validation series"]
        legend_size = 6
        axis_legend = ["Train series", "Validation series"]

        ax[2].set_title(
            'Comparison of model forecast on test series for series ' +
            str(i) + ' (last ' + str(number_of_days) + ' days)')
        ax[2].plot(data_train['series-' + str(i)][-number_of_days:],
                   color="black",
                   linestyle="-")
        ax[2].plot(data_valid['series-' + str(i)],
                   color="green",
                   linestyle="-")

        # data_test only carries the test index (no observations), so the
        # smape returned in this phase is not used for model selection
        if i >= 100:
            smape, forecast, order, seasonal_order = auto_arima_forecast(
                data_train_full['series-' + str(i)],
                data_test,
                horizon,
                del_outliers=True,
                normalize=True,
                plot=False)

        else:
            order = ARIMA_parameters[i - 1][0]
            seasonal_order = ARIMA_parameters[i - 1][1]
            smape, forecast = arima_forecast(data_train_full['series-' +
                                                             str(i)],
                                             data_test,
                                             horizon,
                                             order,
                                             seasonal_order,
                                             del_outliers=True,
                                             normalize=True,
                                             plot=False)

        ARIMA_ORDER_TEST.append((order, seasonal_order))

        # the ARIMA forecast is always kept, whether or not ARIMA is chosen
        data_best_arima['series-' + str(i)] = forecast

        ax[0].plot(forecast, color="steelblue", linestyle="--")
        ax[1].plot(forecast, color="steelblue", linestyle="--")

        ax[2].plot(forecast, color="steelblue", linestyle="--")

        # save the ARIMA test forecast twice: raw, then keyvalue-formatted
        data_save_method = pd.DataFrame(index=interval_test)
        data_save_method['series-' + str(i)] = forecast
        data_save_method.to_csv(fig_folder + str(i) + "_" +
                                auto_arima_forecast.__name__ + "_" +
                                str(order) + "_" + str(seasonal_order) +
                                ".csv")
        data_save_method = keyvalue(data_save_method)
        data_save_method.to_csv(fig_folder + str(i) + "_formatted_" +
                                auto_arima_forecast.__name__ + "_" +
                                str(order) + "_" + str(seasonal_order) +
                                ".csv")

        if auto_arima_forecast == best_model:
            axis_legend_all.append(auto_arima_forecast.__name__ + str(order) +
                                   " " + str(seasonal_order) + " (chosen)")
            axis_legend_copy = copy.copy(axis_legend)
            axis_legend_copy.append(auto_arima_forecast.__name__ + str(order) +
                                    " " + str(seasonal_order) + " (chosen)")
            ax[2].legend(axis_legend_copy, prop={'size': legend_size})

            # ARIMA won on validation: its test forecast goes to the submission
            data_submit['series-' + str(i)] = forecast

            chosen_method.append(best_model.__name__)
            chosen_method_smape.append(str("{:.2f}".format(best_smape)))
            chosen_method_param.append(
                str(order) + " - " + str(seasonal_order))
        else:
            axis_legend_all.append(auto_arima_forecast.__name__)
            axis_legend_copy = copy.copy(axis_legend)
            axis_legend_copy.append(auto_arima_forecast.__name__)
            ax[2].legend(axis_legend_copy, prop={'size': legend_size})

        # test pass over the candidate models of this series
        for model_index in range(nbr_models_current_series):
            ax[model_index + 3].set_title(
                'Comparison of model forecast on test series for series ' +
                str(i) + ' (last ' + str(number_of_days) + ' days)')
            ax[model_index + 3].plot(data_train['series-' +
                                                str(i)][-number_of_days:],
                                     color="black",
                                     linestyle="-")
            ax[model_index + 3].plot(data_valid['series-' + str(i)],
                                     color="green",
                                     linestyle="-")

            smape, forecast = nn_models_current_series[model_index](
                data_train_full['series-' + str(i)],
                data_test,
                nn_models_params_current_series[model_index],
                horizon,
                del_outliers=True,
                normalize=True,
                plot=False)

            ax[0].plot(forecast,
                       color=methods_colors[model_index],
                       linestyle="--")
            ax[1].plot(forecast,
                       color=methods_colors[model_index],
                       linestyle="--")
            ax[model_index + 3].plot(forecast,
                                     color=methods_colors[model_index],
                                     linestyle="--")

            # save this model's test forecast twice: raw, then keyvalue-formatted
            data_save_method = pd.DataFrame(index=interval_test)
            data_save_method['series-' + str(i)] = forecast
            data_save_method.to_csv(
                fig_folder + str(i) + "_" +
                nn_models_current_series[model_index].__name__ + "_" +
                str(nn_models_params_current_series[model_index]) + ".csv")
            data_save_method = keyvalue(data_save_method)
            data_save_method.to_csv(
                fig_folder + str(i) + "_formatted_" +
                nn_models_current_series[model_index].__name__ + "_" +
                str(nn_models_params_current_series[model_index]) + ".csv")

            # NOTE(review): the match is on the callable only, so a callable
            # listed more than once for a series would match every time --
            # confirm that models_to_try never repeats a callable per series.
            if nn_models_current_series[model_index] == best_model:
                axis_legend_all.append(
                    nn_models_current_series[model_index].__name__ +
                    " (chosen)")
                axis_legend_copy = copy.copy(axis_legend)
                axis_legend_copy.append(
                    nn_models_current_series[model_index].__name__ +
                    " (chosen)")
                ax[model_index + 3].legend(axis_legend_copy,
                                           prop={'size': legend_size})

                data_submit['series-' + str(i)] = forecast
                chosen_method.append(best_model.__name__)
                chosen_method_smape.append(str("{:.2f}".format(best_smape)))
                chosen_method_param.append(best_model_param)
            else:
                axis_legend_all.append(
                    nn_models_current_series[model_index].__name__)
                axis_legend_copy = copy.copy(axis_legend)
                axis_legend_copy.append(
                    nn_models_current_series[model_index].__name__)
                ax[model_index + 3].legend(axis_legend_copy,
                                           prop={'size': legend_size})

        ax[0].legend(axis_legend_all, prop={'size': legend_size})
        ax[1].legend(axis_legend_all, prop={'size': legend_size})

        plt.tight_layout()
        plt.savefig(fig_folder + 'series-' + str(i) + "-submission.pdf")
        plt.show()

    # final report: chosen forecasts, their keyvalue-formatted version, the
    # selection details, and the ARIMA-only forecasts
    print("------------ SUBMISSION ------------")
    print()
    print(data_submit.to_string())
    data_submit.to_csv("data/all_best_nn_nosub.csv")
    print()
    print("------------ FORMATED SUBMISSION ------------")
    submission = keyvalue(data_submit)
    print(submission.to_string())
    submission.to_csv(save_path)
    print("------------ CHOSEN METHODS INFO FOR SUBMISSION ------------")
    print("METHODS " + str(chosen_method))
    print("PARAMS " + str(chosen_method_param))
    print("SMAPES " + str(chosen_method_smape))
    print()
    print("------------ ARIMA ORDERS ------------")
    print("ARIMA ORDER VALID " + str(ARIMA_ORDER_VALID))
    print("ARIMA ORDER TEST " + str(ARIMA_ORDER_TEST))
    print()
    print("------------ ARIMA SUBMISSION AN FORMATED SUBMISSION ------------")
    print()
    print(data_best_arima.to_string())
    data_best_arima.to_csv("data/arima_best.csv")
    print()
    submission = keyvalue(data_best_arima)
    print(submission.to_string())
    submission.to_csv("data/arima_best_submission.csv")
    print()
    print(
        "------------ ALL PREDICTED FORECAST ON ACTUAL DATASETS SAVED TO 'seriesid-nameofmethod-params.csv' ------------"
    )

示例#2

显示文件

文件： feature_engineering.py 项目： flying-marmot/project-utils

def preprocessing():
    """Run the pre-processing pipeline for every source in ``load_config['SM']``.

    For each configured source the function loads
    ``./data/<name>/df_<name>.pickle``, computes metrics, normalizes missing
    values, drops sparse columns and fully empty rows, derives date columns
    and imputes missing values. Every imputed variant is then cleaned
    (duplicates, identifier / one-value columns, outliers), labelled, and
    pickled twice: before one-hot encoding (``..._noOHE.pickle``) and after
    one-hot encoding plus standardization (``..._OHE.pickle``).

    Sources whose DataFrame is empty are reported on stdout and skipped.

    Args:
        None

    Returns:
        None
    """

    # Minimum share of non-null values a column needs in order to be kept.
    min_non_null_fraction = 0.9

    for sm in load_config['SM']:

        # Initialization of variables
        folder = sm['name']
        drop_cols = defaultdict(list)

        # Pickle load
        # NOTE: pickle must only be used on trusted files; these are expected
        # to be produced locally by an earlier stage of the pipeline.
        with open('./data/' + folder + '/df_' + folder + '.pickle', 'rb') as handle:
            df = pickle.load(handle)

        # Check if DataFrame is not empty
        if df.shape[0] == 0:

            print('Empty DataFrame for: ' + folder)

            continue

        ######################
        # METRIC CALCULATION #
        ######################

        # Calculate metrics and merge into final analysis DataFrame
        df = SQL.metric_calculation(df)

        ##################
        # MISSING VALUES #
        ##################

        # Correct blanks to nan
        df = utils.white_to_nan(df)

        # Check if there are any missing values, if not skip certain operations
        nulls = pd.isnull(df).sum().sum()

        if nulls > 0:

            # Visualize NaN values
            utils.missing_visuals(df)

        # Remove columns with high nan percentage.
        # ``thresh`` is an absolute COUNT of required non-NA values, not a
        # fraction: passing 0.9 directly only ever dropped all-NaN columns.
        # Scale the fraction by the number of rows (rounded down, at least 1).
        old_cols = df.columns
        min_non_null = max(1, int(min_non_null_fraction * len(df)))
        df = df.dropna(axis=1, thresh=min_non_null)
        drop_cols['high_nan'].append([i for i in old_cols if i not in df.columns])

        # Remove rows with full nan
        df = df.dropna(axis=0, how='all')

        # Convert to datetime possible date columns
        df = utils.datetime_cols(df)

        # Create synthetic dates
        df = utils.synthetic_dates(df)

        ############
        # IMPUTING #
        ############

        # Initialize object
        dict_impute = {}

        if nulls > 0:

            dict_impute = utils.imputing_nan(df)

        else:

            # Nothing to impute: process the frame as a single variant
            dict_impute['no_impute'] = df

        ############
        # CLEANING #
        ############

        # For each imputed DataFrame
        for impute in dict_impute.keys():

            df = dict_impute[impute]

            # Remove duplicates
            df = df.drop_duplicates(keep='first')
            df = df.reset_index(drop=True)

            # Remove columns that are unique identifiers
            df, drop_cols = utils.remove_identifier_columns(df, drop_cols)

            # Remove one-value columns
            df, drop_cols = utils.remove_one_value_columns(df, drop_cols)

            # Convert to categorical possible categorical columns
            df, changed_cols = utils.search_categorical(df, 0.001)

            # Remove outliers using zscore
            df = utils.remove_outliers(df, 3)

            # Assign target labels
            df, target_cols = utils.apply_business_rules(df, folder)

            ##################
            # PRE-PROCESSING #
            ##################

            # Pickle save
            with open('./data/' + folder + '/df_' + impute + '_' + folder + '_noOHE.pickle', 'wb') as handle:
                pickle.dump((df, target_cols), handle)

            # Perform One-Hot Encoding for categorical columns
            df, list_new_columns, drop_cols = utils.one_hot_encoding(df, 5, 1, drop_cols)

            # Standarize and Normalize DataFrame
            df = utils.standarize_normalize(df, target_cols, list_new_columns)

            # Replacing the value of an existing key is safe while iterating
            dict_impute[impute] = df

            # Pickle save
            with open('./data/' + folder + '/df_' + impute + '_' + folder + '_OHE.pickle', 'wb') as handle:
                pickle.dump((df, target_cols), handle)

示例#3

显示文件

文件： app.py 项目： faizancodes/Automated-Fundamental-Analysis

# Streamlit section: let the user pick two sectors and one metric to compare.
st.subheader('Select two Sectors and compare a metric')

# NOTE(review): the options come from a Python set, whose iteration order is
# not guaranteed -- the order shown in the widget may vary between runs.
sector1 = st.selectbox('Select a Sector', (set(df['Sector'])))

# The second choice excludes the sector already picked above.
sector2 = st.selectbox('Select a Sector to Compare',
                       (set(df['Sector']) - {sector1}))

metric = st.selectbox('Select a Metric', (selectable_values))

# Drop rows where the chosen metric holds the '-' placeholder, then convert
# the remaining values to a numeric (float) column.
df = df[df[metric] != '-']
df[metric] = pd.to_numeric(df[metric], downcast="float")

sector1_df = df[df['Sector'] == sector1]
sector2_df = df[df['Sector'] == sector2]

# presumably 3.5 is the outlier cut-off used by ut.remove_outliers -- confirm
sector1_data = ut.remove_outliers(sector1_df, metric, 3.5)
sector2_data = ut.remove_outliers(sector2_df, metric, 3.5)

# Figure and global matplotlib settings (grid on, transparent saved figures).
fig = plt.figure(figsize=(25, 15))
matplotlib.rcParams['axes.grid'] = True
matplotlib.rcParams['savefig.transparent'] = True

# White axis labels and tick marks.
custom_style = {
    'axes.labelcolor': 'white',
    'xtick.color': 'white',
    'ytick.color': 'white'
}

sns.set_style({'axes.grid': False})
sns.set_style(rc=custom_style)

示例#4

显示文件

文件： create_data.py 项目： quarriedstone/EEG-analysis

                                'Beta': mne.filter.filter_data(data=np.mean(experiment_filtered.get_data(), axis=0),
                                                               l_freq=IAF_p + 2, h_freq=30, sfreq=sfreq,
                                                               method="fir")}

        # Calculating calibration values. Consider the mean value of all channels. Values are given in microvolts.
        calibration_values = {}

        for band in WAVES:
            calibration_values[band] = np.mean(eyes_sub_bands[band], axis=0) * np.power(10, 6)

        # Performing STFT transform on experiment data for each sub-band. Window size is given in samples
        window = sfreq * 2
        fft = {}

        for band in WAVES:
            fft[band] = stft(x=experiment_sub_bands[band], fs=sfreq, window=('kaiser', window), nperseg=1000)

        erd = np.vectorize(ERD)
        # Calculating ERD for experiment
        erd_mean = {}

        for band in fft:
            curr_erd = erd(fft[band][2], calibration_values[band])
            erd_mean[band] = remove_outliers(np.real(np.mean(curr_erd, axis=0)))

        # Adding clean Beta and UA energy ratio
        erd_mean["ABratio"] = remove_outliers(np.real(np.power(experiment_sub_bands["UA"] / experiment_sub_bands["Beta"], 2)))

        # Dumping erd_mean of experiment
        pickle.dump(erd_mean, open(obg_dir / subject / "".join([subject, exp_type, ".pkl"]), 'wb'))

示例#5

显示文件

文件： generator_tables.py 项目： marekgr/csit

def table_performance_comparison_mrr(table, input_data):
    """Generate the table(s) with algorithm: table_performance_comparison_mrr
    specified in the specification file.

    The reference and compare throughput samples of every test are cleaned of
    outliers and reduced to mean / stdev (in Mpps); tests having both means
    get a relative change column. The result is sorted by that change and
    written to three csv files (1t1c, 2t2c, 4t4c), each of which is then
    re-read and rendered as a pretty-printed txt table.

    :param table: Table to generate.
    :param input_data: Data to process.
    :type table: pandas.Series
    :type input_data: InputData
    """

    def _throughput_stats(samples):
        """Return [mean, stdev] in Mpps of the samples without outliers, or
        [None, None] when there is nothing to compute from."""
        if not samples:
            return [None, None]
        cleaned = remove_outliers(samples,
                                  outlier_const=table["outlier-const"])
        # TODO: Specify window size.
        if cleaned:
            return [round(mean(cleaned) / 1000000, 2),
                    round(stdev(cleaned) / 1000000, 2)]
        return [None, None]

    title = table.get("title", "")
    logging.info("  Generating the table {0} ...".format(title))

    # Transform the data
    logging.info("    Creating the data set for the {0} '{1}'.".format(
        table.get("type", ""), title))
    data = input_data.filter_data(table, continue_on_error=True)

    # Prepare the header of the tables
    try:
        ref_title = table["reference"]["title"]
        cmp_title = table["compare"]["title"]
        header_str = ",".join([
            "Test case",
            "{0} Throughput [Mpps]".format(ref_title),
            "{0} stdev [Mpps]".format(ref_title),
            "{0} Throughput [Mpps]".format(cmp_title),
            "{0} stdev [Mpps]".format(cmp_title),
            "Change [%]"
        ]) + "\n"
    except (AttributeError, KeyError) as err:
        logging.error(
            "The model is invalid, missing parameter: {0}".format(err))
        return

    # Prepare data to the table:
    # Reference builds create the per-test entries and fill "ref-data".
    tbl_dict = dict()
    for job, builds in table["reference"]["data"].items():
        for build in builds:
            for tst_name, tst_data in data[job][str(build)].iteritems():
                if tbl_dict.get(tst_name) is None:
                    parent_prefix = tst_data["parent"].split("-")[0]
                    name_rest = "-".join(tst_data["name"].split("-")[1:])
                    tbl_dict[tst_name] = {
                        "name": "{0}-{1}".format(parent_prefix, name_rest),
                        "ref-data": list(),
                        "cmp-data": list()
                    }
                try:
                    ref_samples = tbl_dict[tst_name]["ref-data"]
                    ref_samples.append(tst_data["result"]["throughput"])
                except TypeError:
                    pass  # No data in output.xml for this test

    # Compare builds only add samples to tests already seen in the reference.
    for job, builds in table["compare"]["data"].items():
        for build in builds:
            for tst_name, tst_data in data[job][str(build)].iteritems():
                try:
                    cmp_samples = tbl_dict[tst_name]["cmp-data"]
                    cmp_samples.append(tst_data["result"]["throughput"])
                except KeyError:
                    pass
                except TypeError:
                    tbl_dict.pop(tst_name, None)

    # One row per test: name, ref mean, ref stdev, cmp mean, cmp stdev, change.
    # Rows without both means (or with a zero reference mean) are left out.
    tbl_lst = list()
    for entry in tbl_dict.values():
        row = [entry["name"]]
        row.extend(_throughput_stats(entry["ref-data"]))
        row.extend(_throughput_stats(entry["cmp-data"]))
        ref_mean = row[1]
        cmp_mean = row[3]
        if ref_mean is not None and cmp_mean is not None and ref_mean != 0:
            row.append(int(relative_change(float(ref_mean),
                                           float(cmp_mean))))
            tbl_lst.append(row)

    # Sort the table according to the relative change
    tbl_lst.sort(key=lambda row: row[-1], reverse=True)

    # Generate tables:
    # All tests in csv:
    core_tags = ("1t1c", "2t2c", "4t4c")
    tbl_names = [
        "{0}-{1}-full{2}".format(table["output-file"], core_tag,
                                 table["output-file-ext"])
        for core_tag in core_tags
    ]
    for file_name in tbl_names:
        logging.info("      Writing file: '{0}'".format(file_name))
        with open(file_name, "w") as file_handler:
            file_handler.write(header_str)
            cores = file_name.split("-")[-2]
            for test in tbl_lst:
                if cores in test[0]:
                    # The trimmed name is stored back on the row, so it is
                    # also what the following files are matched against.
                    test[0] = "-".join(test[0].split("-")[:-1])
                    file_handler.write(",".join(map(str, test)) + "\n")

    # All tests in txt:
    tbl_names_txt = [
        "{0}-{1}-full.txt".format(table["output-file"], core_tag)
        for core_tag in core_tags
    ]

    for csv_name, txt_name in zip(tbl_names, tbl_names_txt):
        txt_table = None
        logging.info("      Writing file: '{0}'".format(txt_name))
        with open(csv_name, 'rb') as csv_file:
            # The first csv row is the header and creates the table; every
            # following row is appended to it.
            for row in csv.reader(csv_file, delimiter=',', quotechar='"'):
                if txt_table is None:
                    txt_table = prettytable.PrettyTable(row)
                else:
                    txt_table.add_row(row)
            txt_table.align["Test case"] = "l"
        with open(txt_name, "w") as txt_file:
            txt_file.write(str(txt_table))

示例#6

显示文件

文件： nn_multi_step_forecasting.py 项目： edgBR/time_series

def nn_with_past_outliers_multi_step_forecast(series,
                                              validation_series,
                                              input_length,
                                              horizon,
                                              del_outliers=False,
                                              normalize=False,
                                              plot=False):
    """
    Forecast ``horizon`` values at once with a small dense neural network.

    The network has a single hidden layer of 128 ReLU units and is trained for
    200 epochs with the Adam optimizer and MAE loss. Each input sample is the
    concatenation of ``horizon`` values taken 365 steps back from the series
    *with* its outliers, followed by the last ``input_length`` values of the
    (optionally cleaned / normalized) training series. The network outputs
    ``horizon`` values in one shot. Performance is assessed on
    ``validation_series``, whose size must be ``horizon``.

    This function differs from nn_multi_step_forecast in that, in addition to
    the last ``input_length`` values, the same period of the previous year is
    fed to the network, in the hope of gaining information from it.

    NOTE(review): the 365-step look-back presumes daily data with at least
    365 observations, and ``horizon`` must be smaller than 365 (otherwise the
    slice end is no longer negative and the intended window is not selected).

    :param series: training series; must expose ``.values`` and support
        slicing (presumably a pandas Series - confirm against callers)
    :param validation_series: series to score against; its index is reused as
        the index of the returned forecast
    :param input_length: number of most recent values used as network input
    :param horizon: number of values predicted in one step
    :param del_outliers: if True, train on ``remove_outliers(series)``
    :param normalize: if True, normalize the series with ``normalize_series``
        before training and de-normalize the forecast afterwards
    :param plot: if True, plot the last 100 training values, the validation
        series and the forecast
    :return: SMAPE for the validation series, the forecast validation series
    """

    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)

    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)

        # only the normalized values of the raw series are needed; the forecast
        # is de-normalized with the scaler of the working series
        _, working_series_with_outliers = normalize_series(series)
    else:
        scaler = None
        working_series_with_outliers = series

    # input sequence is our data, np.log1p is applied to the data and mae error is used to approximate SMAPE error
    train_series = np.log1p(working_series)

    # we use the last n_steps_in days as input and predict n_steps_out
    n_steps_in, n_steps_out = input_length, horizon

    # split into samples
    train_samples, train_targets = split_sequence_nn_with_past_outliers_multi_step(
        train_series, working_series_with_outliers, n_steps_in, n_steps_out)

    # create the model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=n_steps_in + horizon))

    # we predict n_steps_out values
    model.add(Dense(n_steps_out))

    # we use 'mae' with data transformed with log1p and expm1 to approach SMAPE error
    model.compile(optimizer='adam', loss='mae')

    # fit model
    model.fit(train_samples, train_targets, epochs=200, verbose=0)

    # perform prediction

    # input is the last n_steps_in values of the train series (working_series is not log1p transformed)
    # in addition, we prepend the horizon values from the last year
    validation_in_sample = np.log1p(
        np.append(
            np.array(working_series_with_outliers.values[-365:-365 + horizon]),
            np.array(working_series.values[-n_steps_in:])))
    validation_in_sample = validation_in_sample.reshape(
        (1, n_steps_in + horizon))
    validation_forecast = model.predict(validation_in_sample, verbose=0)

    # reverse log1p using expm1 and flatten: the network returns one row per
    # input sample (a 2-D array), which cannot be assigned to a DataFrame
    # column whose length is horizon
    validation_forecast = np.expm1(validation_forecast).reshape(-1)

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:

        # use scaler to reverse normalizing (it expects a column vector)
        denormalized_forecast = scaler.inverse_transform(
            validation_forecast.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast

    else:

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = validation_forecast

    if plot:
        plt.figure(figsize=(10, 6))

        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")

        plt.legend(["Train series", "Validation series", "Predicted series"])

        plt.title(
            "Validation of simple multi step NN with past values and input size "
            + str(n_steps_in) + " output size " + str(n_steps_out))
        plt.show()

    return smape(
        validation_series,
        forecast_dataframe['forecast']), forecast_dataframe['forecast']

示例#7

显示文件

文件： arima_forecasting.py 项目： edgBR/time_series

def arima_forecast(series,
                   validation_series,
                   horizon,
                   order,
                   seasonal_order,
                   del_outliers=False,
                   normalize=False,
                   plot=False):
    """
    Fit an ARIMA model with a fixed order / seasonal order and score its
    ``horizon``-step forecast against a validation series.

    :param series: training series
    :param validation_series: series to score against; its index is reused as
        the index of the returned forecast
    :param horizon: number of periods to predict
    :param order: order passed to ``arima.ARIMA``
    :param seasonal_order: seasonal order passed to ``arima.ARIMA``
    :param del_outliers: if True, train on ``remove_outliers(series)``
    :param normalize: if True, normalize the training series with
        ``normalize_series`` and de-normalize the forecast afterwards
    :param plot: if True, plot the last 100 training values, the validation
        series and the forecast
    :return: SMAPE for the validation series, the forecast validation series
    """

    # optional pre-processing of the training data
    training_data = remove_outliers(series) if del_outliers else series

    scaler = None
    if normalize:
        scaler, training_data = normalize_series(training_data)

    # build the model with the requested orders, fit it and forecast
    model = arima.ARIMA(order=order,
                        seasonal_order=seasonal_order,
                        suppress_warnings=True)
    model.fit(training_data)
    raw_forecast = model.predict(n_periods=horizon)

    # the result is indexed like the validation series
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    if normalize:
        # the scaler works on column vectors; unwrap each row afterwards
        rescaled = scaler.inverse_transform(raw_forecast.reshape(-1, 1))
        forecast_dataframe['forecast'] = [row[0] for row in rescaled]
    else:
        forecast_dataframe['forecast'] = raw_forecast

    if plot:
        plt.figure(figsize=(10, 6))

        curves = (
            (series[-100:], "blue", "-"),
            (validation_series, "green", "-"),
            (forecast_dataframe, "red", "--"),
        )
        for curve, colour, style in curves:
            plt.plot(curve, color=colour, linestyle=style)

        plt.legend(["Train series", "Validation series", "Predicted series"])

        title = ("Validation of arima model with order " + str(order) +
                 " seasonal order " + str(seasonal_order))
        plt.title(title)

        plt.show()

    score = smape(validation_series, forecast_dataframe['forecast'])
    return score, forecast_dataframe['forecast']

示例#8

显示文件

文件： arima_forecasting.py 项目： edgBR/time_series

def auto_arima_forecast(series,
                        validation_series,
                        horizon,
                        del_outliers=False,
                        normalize=False,
                        plot=False):
    """
    Let ``auto_arima`` search for the best ARIMA parameters on the training
    series, then score the ``horizon``-step forecast of the selected model
    against a validation series.

    :param series: training series
    :param validation_series: series to score against; its index is reused as
        the index of the returned forecast
    :param horizon: number of periods to predict
    :param del_outliers: if True, train on ``remove_outliers(series)``
    :param normalize: if True, normalize the training series with
        ``normalize_series`` and de-normalize the forecast afterwards
    :param plot: if True, plot the last 100 training values, the validation
        series and the forecast
    :return: SMAPE for the validation series, the forecast validation series, order, seasonal_order
    """

    # optional pre-processing of the training data
    training_data = remove_outliers(series) if del_outliers else series

    scaler = None
    if normalize:
        scaler, training_data = normalize_series(training_data)

    # stepwise seasonal search with a seasonal period of 7
    search_options = dict(seasonal=True,
                          max_D=2,
                          m=7,
                          trace=True,
                          error_action='ignore',
                          suppress_warnings=True,
                          stepwise=True)
    model = auto_arima(training_data, **search_options)

    # remember which orders the search selected so callers can reuse them
    selected_params = model.get_params()
    order = selected_params['order']
    seasonal_order = selected_params['seasonal_order']

    # the model returned by auto_arima is used directly, without another fit
    raw_forecast = model.predict(n_periods=horizon)

    # the result is indexed like the validation series
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    if normalize:
        # the scaler works on column vectors; unwrap each row afterwards
        rescaled = scaler.inverse_transform(raw_forecast.reshape(-1, 1))
        forecast_dataframe['forecast'] = [row[0] for row in rescaled]
    else:
        forecast_dataframe['forecast'] = raw_forecast

    if plot:
        plt.figure(figsize=(10, 6))

        curves = (
            (series[-100:], "blue", "-"),
            (validation_series, "green", "-"),
            (forecast_dataframe, "red", "--"),
        )
        for curve, colour, style in curves:
            plt.plot(curve, color=colour, linestyle=style)

        plt.legend(["Train series", "Validation series", "Predicted series"])

        plt.title("Validation of auto arima model")

        plt.show()

    score = smape(validation_series, forecast_dataframe['forecast'])
    return score, forecast_dataframe['forecast'], order, seasonal_order