Example #1
def traceback_outside(message):

    try:
        print(10 / 0)

    except Exception:
        mylogging.traceback("message")
Example #2
def test_readme():
    import mylogging

    mylogging.set_warnings(debug=1)

    mylogging.warn(
        "Hessian matrix copmputation failed for example",
        caption="RuntimeError on model x",
    )

    print(
        "We can log / warn tracebacks from expected errors and continue runtime."
    )

    try:
        print(10 / 0)

    except ZeroDivisionError:
        mylogging.traceback("Maybe try to use something different than 0.")

    print(
        "Info will not trigger warning, but just print to console (but follows the rule in set_warnings(debug))."
    )

    mylogging.info("I am interesting info")
Example #3
def decompose(data: np.ndarray,
              period: int = 365,
              model: Literal["additive", "multiplicative"] = "additive"):
    """Plot decomposition graph. Analyze if data are seasonal.

    Args:
        data (np.ndarray): Time series data
        period (int, optional): Seasonal interval. Defaults to 365.
        model (Literal["additive", "multiplicative"], optional): 'additive' or 'multiplicative'. Defaults to 'additive'.
    """
    if not misc.GLOBAL_VARS.PLOTS_CONFIGURED:
        misc.setup_plots()

    import matplotlib.pyplot as plt
    from statsmodels.tsa.seasonal import seasonal_decompose

    try:
        decomposition = seasonal_decompose(data, model=model, period=period)

        plt.figure(figsize=(15, 8))
        plt.subplot(4, 1, 1)
        plt.plot(decomposition.observed)
        plt.xlabel("Date")
        plt.ylabel("Real values")

        plt.subplot(4, 1, 2)
        plt.plot(decomposition.trend)
        plt.xlabel("Date")
        plt.ylabel("Trend")

        plt.subplot(4, 1, 3)
        plt.plot(decomposition.seasonal)
        plt.xlabel("Date")
        plt.ylabel("Seasonality")

        plt.subplot(4, 1, 4)
        plt.plot(decomposition.resid)
        plt.xlabel("Date")
        plt.ylabel("Residuals")

        plt.suptitle("Seasonal decomposition", fontsize=20)
        plt.subplots_adjust(top=0.88)

        plt.draw()

    except ValueError:
        mylogging.traceback(
            "Number of samples is probably too low to compute.")
Example #4
def test_logs():

    mylogging.config.TO_FILE = "delete.log"

    errors = []

    def check_log():
        with open("delete.log") as log:
            log_content = log.read()

        os.remove("delete.log")

        return bool(log_content)

    mylogging.info(
        "Hessian matrix copmputation failed for example",
        caption="RuntimeError on model x",
    )

    if not check_log():
        errors.append("Info not created")

    mylogging.warn(
        "Hessian matrix copmputation failed for example",
        caption="RuntimeError on model x",
    )

    if not check_log():
        errors.append("Warning not created")

    try:
        print(10 / 0)

    except Exception:
        mylogging.traceback("Maybe try to use something different than 0")

    if not check_log():
        errors.append("Traceback not created")

    for i in [info_outside, warn_outside, traceback_outside]:
        i("Message")
        if not check_log():
            errors.append("Outside function not working")
Example #5
def print_traceback(caption="Error", message=""):
    """Print python error to user web based GUI.

    Note:
        Suppose exposed function create_alert from Vue.

    Args:
        message (str, optional): Heading of detailed traceback. Defaults to "Error".
    """
    mylogging.traceback(
        f"Catched error - server still running.\n\n{message}",
        caption,
    )
    pyvueeel_internal.eel.create_alert(
        caption,
        message,
        traceback.format_exc(),
        "error",
    )
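
A hedged sketch of where such a helper would typically be used in an exposed backend function; the function name and file path below are illustrative, not part of the original code:

def load_project(path):
    try:
        with open(path) as project_file:
            return project_file.read()
    except Exception:
        # Forward the error both to the python log and to the web GUI alert.
        print_traceback(caption="Loading failed", message=f"Could not load {path}")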
Example #6
def test_databases():
    client = docker.from_env()
    container = client.containers.run("mssql:latest", ports={1433: 1433}, detach=True)

    time.sleep(50)

    df = pd.DataFrame([range(1, 11), ["Product " + str(i) for i in range(10)]]).T
    df.columns = ["ID", "ProductName"]

    try:
        mdp.database.database_write(
            df,
            server=".",
            database="DemoData",
            table="Products",
            username="******",
            password="******",
            if_exists="replace",
        )

        data = mdp.database.database_load(
            server=".",
            database="DemoData",
            query="""
                SELECT TOP (1000) [ID] ,[ProductName]
                FROM [DemoData].[dbo].[Products]
            """,
            username="******",
            password="******",
        )

    except Exception:
        mylogging.traceback()

    finally:
        container.stop()

    assert len(data) == 10
Example #7
def test_readme_to_file():

    import mylogging

    mylogging.config.TO_FILE = "log.log"  # You can use relative (just log.log)

    # Then it's the same

    mylogging.warn(
        "Hessian matrix copmputation failed for example",
        caption="RuntimeError on model x",
    )

    try:
        print(10 / 0)
    except ZeroDivisionError:
        mylogging.traceback("Maybe try to use something different than 0.")

    mylogging.info("I am interesting info")

    mylogging.config.TO_FILE = False

    os.remove("log.log")
Example #8
def compare_models(
    data_all=None,
    predicted_column: list | tuple | str | None = None,
    config: predictit.configuration.Config | dict | None = None,
    **kwargs,
) -> predictit._result_classes.Comparison:
    """Function that helps to choose appropriate models. It evaluates it on test data and then return results.
    After you know what models are the best, you can use only them in functions predict() or predict_multiple_columns.
    You can define your own test data and find best modules for your process.

    Only data_all and predicted column can be positional.

    Check README or tests for working examples.

    Args:
        data_all ((dict, None)): Dictionary of data name as key and config data field and used column as value
            `{data_1: (my_dataframe, 'column_name_or_index')}` or tuple of data with same predicted
            column configured in config `(my_data[-2000:], my_data[-1000:])`
        predicted_column (list | tuple | str | None, optional): Index of predicted column or it's name (dataframe).
            If list with more values only the first one will be evaluated (use predict_multiple_columns function
            if you need that. Default to None.
        config (predictit.configuration.Config | dict | None, optional): Settings as Config instance or dictionary.
            Check class for what you can use. If None, then default config will be used. Defaults to None.
        **kwargs (dict, optional): There is much more parameters' in this function. Check configuration.py for parameters details.
    """

    from tabulate import tabulate

    if config is None or isinstance(config, dict):
        update_config = config
        config = config_default
        config = config.copy()
        if update_config:
            config.update(update_config)

    elif isinstance(config, predictit.configuration.Config):
        config = config.copy()

    # Edit configuration.py default values with arguments values if exist
    if data_all is not None:
        config.data_all = data_all

    if predicted_column is not None:
        config.predicted_column = predicted_column

    # Edit config.py default values with arguments values if exist
    config.update(kwargs)

    predictit._helpers.logger_init_from_config(config.output.logger_subconfig)

    # Edit config.py default values with arguments values if exist
    config.update({
        "mode": "validate",
        "confidence_interval": None,
        "optimizeit": False,
        "evaluate_type": "preprocessed",
        "print_result_details": False,
        "print_time_table": False,
    })

    # If no data_all inserted, default will be used
    if not config.data_all:
        config.data_all = {
            "sin": (mdp.generate_data.sin(), 0),
            "Sign": (mdp.generate_data.sign(), 0),
            "Random data": (mdp.generate_data.random(), 0),
        }
        mylogging.warn("Test data was used. Setup 'data_all' in config...")

    data_dict = config.data_all
    same_data = False

    if isinstance(data_dict, (list, tuple, np.ndarray)):
        same_data = True
        data_dict = {
            f"Data {i}": (j, config.predicted_column)
            for i, j in enumerate(data_dict)
        }

    optimization_number = len(
        config.optimization_values
    ) if config.variable_optimization.optimization else 1

    results_errors_absolute_array = np.zeros(
        (len(data_dict), optimization_number, len(config.used_models)))
    results_errors_absolute_array.fill(np.nan)
    results_errors_standardized_array = results_errors_absolute_array.copy()

    all_models_results = {}

    for g, (i, j) in enumerate(data_dict.items()):

        config.data = j[0]
        if not same_data:
            config.predicted_column = j[1]

        config.plot_name = i

        try:

            prediction_result = predict(config=config)
            all_models_results[i] = prediction_result
            evaluated_matrix = prediction_result.misc.evaluated_matrix
            results_errors_absolute_array[g] = evaluated_matrix

            # Standardize results to be able to have average error through different data
            if np.nanmax(evaluated_matrix) - np.nanmin(evaluated_matrix) > 0:
                results_errors_standardized_array[g] = (
                    evaluated_matrix - np.nanmin(evaluated_matrix)
                ) / (np.nanmax(evaluated_matrix) - np.nanmin(evaluated_matrix))
            else:
                results_errors_standardized_array[g] = evaluated_matrix

        except Exception:
            mylogging.traceback(f"Comparison for data {i} didn't finished.",
                                level="ERROR")

    (
        best_results_errors,
        best_models_optimized_values,
        optimized_values_results_df,
        best_model_name,
        best_optimized_value,
    ) = predictit.analyze.analyze_results(
        results_errors_absolute_array,
        config.variable_optimization.optimization_values,
        config.models.used_models,
        config.prediction.error_criterion,
    )

    (
        best_results_errors_on_standardized_data,
        best_models_optimized_values_on_standardized_data,
        optimized_values_results_on_standardized_data_df,
        best_model_name_on_standardized_data,
        best_optimized_value_on_standardized_data,
    ) = predictit.analyze.analyze_results(
        results_errors_standardized_array,
        config.variable_optimization.optimization_values,
        config.models.used_models,
        config.prediction.error_criterion,
    )

    evaluated_results_df = pd.DataFrame(index=config.used_models)
    evaluated_results_df["Errors average"] = best_results_errors
    evaluated_results_df[
        "Standardized\nerror average"] = best_results_errors_on_standardized_data

    if config.variable_optimization.optimization:
        evaluated_results_df[
            "Best optimized\nvalues"] = best_models_optimized_values
        evaluated_results_df[
            "Standardized best\noptimized values"] = best_models_optimized_values_on_standardized_data

    if config.sort_results_by == "error":
        evaluated_results_df.sort_values("Errors average", inplace=True)

    if config.output.print_subconfig_compare_models.print_table:
        print(
            "\n\nComplete results for comparison"
            "\nTable of complete results. Standardized error is between 0 and 1. If 0, the model was the "
            "best on all defined data; 1 means it was the worst.")

    simple_table_df = mdp.misc.edit_table_to_printable(
        evaluated_results_df.drop(
            [
                i for i in [
                    "Standardized\nerror average",
                    "Standardized best\noptimized values",
                ] if i in evaluated_results_df.columns
            ],
            axis=1,
        ).iloc[:config.print_number_of_comparison_models, :].reset_index())
    detailed_table_df = (
        mdp.misc.edit_table_to_printable(evaluated_results_df).
        iloc[:config.print_number_of_comparison_models, :].reset_index())
    tables = predictit._result_classes.Tables(
        simple=tabulate(
            simple_table_df.values,
            headers=simple_table_df.columns,
            **config.table_settings,
        ),
        detailed=tabulate(
            detailed_table_df.values,
            headers=detailed_table_df.columns,
            **config.table_settings,
        ),
        time="To be implemented",
        simple_table_df=simple_table_df,
        detailed_table_df=detailed_table_df,
    )

    if config.output.print_subconfig_compare_models.print_comparison_result_details:

        if best_model_name == best_model_name_on_standardized_data:
            print(f"Best model is {best_model_name}\n")
        else:
            print(
                f"Best model on average error is {best_model_name} and best of standardized errors is {best_model_name_on_standardized_data}\n"
            )

        if config.variable_optimization.optimization:
            if best_optimized_value == best_optimized_value_on_standardized_data:
                print(
                    f"Best optimized value for {config.variable_optimization.optimization_variable} is {best_optimized_value}\n"
                )
            else:
                print(
                    f"Best optimized value on absolute error is {best_optimized_value}"
                )
                print(
                    f"Best optimized value on standardized error is {best_optimized_value_on_standardized_data}\n"
                )

    if config.output.print_subconfig_compare_models.print_comparison_table == "simple":
        print(f"\n{tables.simple}\n")

    elif config.output.print_subconfig_compare_models.print_comparison_table == "detailed":
        print(f"\n{tables.detailed}\n")

    # if config.print_time_table:
    # TODO
    # print(f"\n{tables.time}\n")

    if config.variable_optimization.optimization:

        best_optimized_values_dict = {
            j: best_models_optimized_values[i]
            for i, j in enumerate(config.used_models)
        }

        optimization_result = predictit._result_classes.Optimization(
            optimized_variable=config.variable_optimization.optimization_variable,
            optimized_options=config.variable_optimization.optimization_values,
            best_value=best_model_name,
            values_results_df=optimized_values_results_df,
            best_values_for_models=best_optimized_values_dict,
        )

        best_optimized_values_dict_on_standardized_data = {
            j: best_models_optimized_values_on_standardized_data[i]
            for i, j in enumerate(config.used_models)
        }

        optimization_standardized_result = predictit._result_classes.Optimization(
            optimized_variable=config.variable_optimization.optimization_variable,
            optimized_options=config.variable_optimization.optimization_values,
            best_value=best_model_name_on_standardized_data,
            values_results_df=optimized_values_results_on_standardized_data_df,
            best_values_for_models=best_optimized_values_dict_on_standardized_data,
        )
    else:
        optimization_result = None
        optimization_standardized_result = None

    standardized_result = predictit._result_classes.ComparisonStandardized(
        best_model_name_standardized=best_model_name_on_standardized_data,
        best_optimized_value_standardized=best_optimized_value_on_standardized_data,
        optimization_standardized=optimization_standardized_result,
    )
    comparison_result = predictit._result_classes.Comparison(
        results_df=evaluated_results_df,
        best_model_name=best_model_name,
        best_optimized_value=best_optimized_value,
        tables=tables,
        all_models_results=all_models_results,
        standardized_results=standardized_result,
        optimization=optimization_result,
    )

    return comparison_result
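
A minimal usage sketch, assuming compare_models is exported at the package level; with data_all left empty, the default generated sin/sign/random test series mentioned above are used:

import predictit

comparison = predictit.compare_models()

# The returned Comparison object carries the aggregated results.
print(comparison.best_model_name)
print(comparison.tables.simple)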
Example #9
def predict_multiple_columns(
    data=None,
    predicted_columns: list | tuple | str | None = None,
    freqs: list | tuple | str | None = None,
    config: predictit.configuration.Config | dict | None = None,
    **kwargs,
) -> predictit._result_classes.Multiple:
    """Predict multiple columns and multiple frequencies at once. Use predict function.

    Only data and predicted_columns can be positional.

    Check README or tests for working examples.

    Args:
        data (np.ndarray, pd.DataFrame): Time series. Can be 2-D with more columns.
            !!! In a numpy array use data series as rows, but in a DataFrame use columns !!! Defaults to [].
        predicted_columns (list | tuple | str | None, optional): List of indexes of the predicted columns or their names (DataFrame).
            Defaults to None.
        freqs (list | tuple | str | None, optional): If a date index is available, resample the data and predict in the defined
            time frequencies. If None, then the value from config will be used. Defaults to None.
        config (predictit.configuration.Config | dict | None, optional): Settings as Config instance or dictionary.
            Check the class for what you can use. If None, then the default config will be used. Defaults to None.
        **kwargs (dict, optional): There are many more parameters of this function. Check configuration.py
            for parameter details.

    Returns:
        predictit._result_classes.Multiple: Best predictions and detailed results for all predicted columns and frequencies.
    """

    if config is None or isinstance(config, dict):
        update_config = config
        config = config_default
        config = config.copy()
        if update_config:
            config.update(update_config)

    elif isinstance(config, predictit.configuration.Config):
        config = config.copy()

    # Edit configuration.py default values with arguments values if exist
    if data is not None:
        config.data = data

    if predicted_columns is not None:
        config.predicted_columns = predicted_columns

    if freqs is not None:
        config.freqs = freqs

    if not config.predicted_columns or not isinstance(config.predicted_columns, list):
        raise TypeError(
            mylogging.return_str(
                "predict_multiple_columns function needs the predicted_columns config value to be a list."
            ))

    config.update(kwargs)

    predictit._helpers.logger_init_from_config(config.output.logger_subconfig)

    if not config.data_input.freqs:
        freqs = ["Default frequency"]
    else:
        freqs = config.data_input.freqs

    if config.predicted_columns in ["*", ["*"]]:

        if isinstance(config.data, str):
            config.data = mdp.load_data.load_data(
                config.data,
                header=config.header,
                csv_style=config.csv_style,
                predicted_table=config.predicted_table,
                max_imported_length=config.max_imported_length,
                request_datatype_suffix=config.request_datatype_suffix,
                data_orientation=config.data_orientation,
            )

        config.predicted_columns = mdp.preprocessing.data_consolidation(
            config.data).columns

    results = {}
    best_predictions_dataframes = {}

    for fi, f in enumerate(freqs):

        result_dataframe = pd.DataFrame()

        for ci, c in enumerate(config.predicted_columns):

            config.predicted_column = c
            config.freq = f

            result_name = f"Column: {c}" if len(
                freqs) == 1 else f"Column: {c} - Freq: {f}"

            try:
                results[result_name] = predict(config=config)

                result_dataframe[c] = results[result_name].best_prediction

            except Exception:
                mylogging.traceback(
                    f"Error in making predictions on column {c} and freq {f}",
                    level="ERROR",
                )

        best_predictions_dataframes[f"Freq: {f}"] = result_dataframe

    return predictit._result_classes.Multiple(
        best_predictions_dataframes=best_predictions_dataframes,
        results=results)
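
A minimal usage sketch, assuming predict_multiple_columns is exported at the package level; the DataFrame and its column names are illustrative only:

import numpy as np
import pandas as pd
import predictit

df = pd.DataFrame(np.random.randn(300, 2), columns=["Temperature", "Pressure"])

multiple = predictit.predict_multiple_columns(
    data=df,
    predicted_columns=["Temperature", "Pressure"],
)

# Best predictions are grouped per frequency, one DataFrame with a column per predicted column.
print(multiple.best_predictions_dataframes)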
Example #10
def predict(
    data=None,
    predicted_column: None | str | int = None,
    config: predictit.configuration.Config | dict | None = None,
    **kwargs,
) -> predictit._result_classes.Result:
    """Make predictions mostly on time-series data. Data input and other config options can be set up in
    configuration.py or overwritten on the fly. Setup can be also done as function input arguments or as command line
    arguments (it will overwrite config values).

    There are working examples in main readme and also in test_it module.

    The function can be configured with a Config from configuration, with command line arguments, as well as with
    function parameters. There are only two possible positional parameters - `data` and `predicted_column`. The rest
    of the parameters must be named parameters. Params are not documented here, because all config params work here
    when passed as kwargs.

    Args:
        data (np.ndarray, pd.DataFrame, str): Time series. Can be 2-D - more columns. Can be numpy array, DataFrame,
            path to file or url.
            Examples: "/home/user/my.json", or "https://yoururl/your.csv" or np.random.randn(100, 2).
        predicted_column (None | str | int, optional): Index of the predicted column or its name (DataFrame).
            If a list with more values is passed, only the first one will be evaluated (use the predict_multiple_columns
            function if you need more). Defaults to None.
        config (predictit.configuration.Config | dict | None, optional): Settings as Config instance or dictionary.
            Check class for what you can use. If None, then default config will be used. Defaults to None.
        **kwargs (dict, optional): There are many more parameters of the predict function. Check configuration.py
            for parameter details.

    Returns:
        Depends on the 'return_type' config value - returns the best prediction (np.ndarray), all models' results (np.ndarray),
        detailed results (dict), an interactive plot, or prints tables of results.

    """

    from mypythontools.plots import plot

    if config is None or isinstance(config, dict):
        update_config = config
        config = config_default
        config = config.copy()
        if update_config:
            config.update(update_config)

    elif isinstance(config, predictit.configuration.Config):
        config = config.copy()

    if config.use_config_preset and config.use_config_preset != "none":
        updated_config = config.presets[config.use_config_preset]
        config.update(updated_config)

    # Edit configuration.py default values with arguments values if exist
    if data is not None:
        config.data = data

    if predicted_column is not None:
        config.predicted_column = predicted_column

    config.update(kwargs)

    predictit._helpers.logger_init_from_config(config.output.logger_subconfig)

    # Not repeating actually means evaluating only once
    if not config.repeatit:
        config.repeatit = 1

    _GUI = GLOBAL_VARS.GUI

    # Add everything printed + warnings to variable to be able to print in GUI
    if _GUI:
        stdout = sys.stdout
        sys.stdout = io.StringIO()

    # Don't want to wrap every call in a GUI condition, so if not in GUI, do nothing
    if _GUI:

        def update_gui(content, html_id):
            try:
                predictit.gui_start.edit_gui_py(content, html_id)
            except Exception:
                pass

    else:

        def update_gui(content, html_id):
            pass

    # Definition of the table for spent time on code parts
    time_df = []

    def update_time_table(time_last):
        time_df.append([progress_phase, round((time.time() - time_last), 3)])
        return time.time()

    time_point = time_begin = time.time()

    ###############
    ### ANCHOR ### Data
    #############

    progress_phase = "Data loading and preprocessing"
    update_gui(progress_phase, "progress_phase")

    data = mdp.load_data.load_data(
        config.data,
        header=config.header,
        csv_style=config.csv_style,
        predicted_table=config.predicted_table,
        max_imported_length=config.max_imported_length,
        request_datatype_suffix=config.request_datatype_suffix,
        data_orientation=config.data_orientation,
    )

    ###############
    ### ANCHOR ### Data consolidation
    #############

    if not config.predicted_column:
        config.predicted_column = 0

    data_for_predictions_df = mdp.preprocessing.data_consolidation(
        data,
        predicted_column=config.predicted_column,
        other_columns=config.other_columns,
        datalength=config.datalength,
        datetime_column=config.datetime_column,
        unique_threshold=config.unique_threshold,
        embedding=config.embedding,
        freq=config.freq,
        resample_function=config.resample_function,
        remove_nans_threshold=config.remove_nans_threshold,
        remove_nans_or_replace=config.remove_nans_or_replace,
        dtype=config.dtype,
    )

    # In data consolidation the predicted column was moved to index 0 as the first column
    predicted_column_index = 0
    predicted_column_name = data_for_predictions_df.columns[0]

    ###############
    ### ANCHOR ### Analyze original data
    #############

    column_for_predictions_series = data_for_predictions_df.iloc[:, 0:1]
    results = {}
    data_inputs = []

    if config.mode == "validate":
        column_for_predictions_series = column_for_predictions_series.iloc[: -config.output.predicts, :]
        config.repeatit = 1

    for i in config.used_models:
        data_inputs.append(config.models_input[i])
    data_inputs = set(data_inputs)

    if config.analyzeit == 1 or config.analyzeit == 3:
        print("Analyze of unprocessed data")
        try:
            predictit.analyze.analyze_column(data_for_predictions_df.values[:, 0], window=30)
            predictit.analyze.analyze_data(data_for_predictions_df)
            predictit.analyze.decompose(
                data_for_predictions_df.values[:, 0],
                **config.analyze_seasonal_decompose,
            )
        except Exception:
            mylogging.traceback("Analyze failed", level="ERROR")

    semaphor = None

    if config.multiprocessing:

        multiprocessing.freeze_support()

        if not config.processes_limit:
            config.processes_limit = multiprocessing.cpu_count() - 1

        if config.multiprocessing == "process":
            pipes = []
            semaphor = multiprocessing.Semaphore(config.processes_limit)

        elif config.multiprocessing == "pool":
            pool = multiprocessing.Pool(config.processes_limit)

            # It is not easy to share data in multiprocessing, so results are returned via a callback function
            def return_result(result):
                for i, j in result.items():
                    results[i] = j

    ### Optimization loop

    if (not config.optimization or not config.optimization_variable
            or not config.optimization_values
            or len(config.optimization_values) == 1):
        config.variable_optimization.optimization = False
        config.optimization_values = ["Not optimized"]
        config.optimization_variable = None

    time_point = update_time_table(time_point)
    progress_phase = "Predict"
    update_gui(progress_phase, "progress_phase")

    models_indexed = {i: j for i, j in enumerate(config.used_models)}

    ###############
    ### ANCHOR ### Main loop
    #############

    for optimization_index, optimization_value in enumerate(
            config.optimization_values):

        # TODO check why setattr - may be wrong after config change
        if config.optimization_variable:
            setattr(config, config.optimization_variable, optimization_value)

        ###############
        ### ANCHOR ### Feature extraction
        #############

        if config.add_fft_columns:
            data_for_predictions_df = mdp.feature_engineering.add_frequency_columns(
                data_for_predictions_df,
                window=config.feature_engineering.add_fft_columns,
            )

        if config.data_extension:
            data_for_predictions_df = mdp.feature_engineering.add_derived_columns(
                data_for_predictions_df,
                **config.feature_engineering.data_extension)

            ###############
            ### ANCHOR ### Feature selection
            #############

            # data_for_predictions_df TODO

            ###############
            ### ANCHOR ### Data preprocessing
            #############

        if config.mode == "validate":
            test_unstandardized = mdp.misc.split(
                data_for_predictions_df, predicts=config.predicts)[1].values
            models_test_outputs_unstandardized = [test_unstandardized]

        else:
            models_test_outputs_unstandardized = mdp.create_model_inputs.create_tests_outputs(
                data_for_predictions_df.values[:, 0],
                predicts=config.predicts,
                repeatit=config.repeatit,
            )

        data_for_predictions, last_undiff_value, final_scaler = mdp.preprocessing.preprocess_data(
            data_for_predictions_df.values,
            remove_outliers=config.remove_outliers,
            smoothit=config.smoothit,
            correlation_threshold=config.correlation_threshold,
            data_transform=config.data_transform,
            standardizeit=config.standardizeit,
            bins=config.bins,
            binning_type=config.binning_type,
        )

        data_for_predictions = cast(np.ndarray, data_for_predictions)

        if config.mode == "validate":
            data_for_predictions, test = mdp.misc.split(
                data_for_predictions, predicts=config.predicts)
            models_test_outputs = [test]

        else:
            models_test_outputs = mdp.create_model_inputs.create_tests_outputs(
                data_for_predictions[:, 0],
                predicts=config.predicts,
                repeatit=config.repeatit,
            )

        column_for_predictions_processed = data_for_predictions[:, predicted_column_index]

        data_shape = np.shape(data_for_predictions)
        data_length = len(column_for_predictions_processed)

        data_std = np.std(column_for_predictions_processed[-30:])
        data_mean = np.mean(column_for_predictions_processed[-30:])
        data_abs_max = max(
            abs(column_for_predictions_processed.min()),
            abs(column_for_predictions_processed.max()),
        )

        multicolumn = 0 if data_shape[1] == 1 else 1

        if (config.analyzeit == 2 or config.analyzeit == 3
            ) and optimization_index == len(config.optimization_values) - 1:

            print("\n\nAnalyze of preprocessed data\n")
            try:
                predictit.analyze.analyze_column(
                    column_for_predictions_processed, window=30)
                predictit.analyze.analyze_data(data_for_predictions)
                predictit.analyze.decompose(
                    column_for_predictions_processed,
                    **config.analyze_seasonal_decompose,
                )

            except Exception:
                mylogging.traceback("Analyze failed", level="ERROR")

        min_data_length = 3 * config.predicts + config.default_n_steps_in

        if (data_length < min_data_length or data_length <
                config.repeatit + config.default_n_steps_in + config.predicts):
            config.repeatit = 1
            min_data_length = 3 * config.predicts + config.default_n_steps_in

        assert min_data_length < data_length, mylogging.return_str(
            "Set up fewer predicted values in settings or add more data",
            caption="Too few data",
        )

        for data_inputs_name in data_inputs:
            try:
                (
                    model_train_input,
                    model_predict_input,
                    model_test_inputs,
                ) = mdp.create_model_inputs.create_inputs(
                    data_for_predictions,
                    input_type_name=data_inputs_name,
                    input_type_params=config.data_inputs[data_inputs_name],
                    mode=config.mode,
                    predicts=config.predicts,
                    repeatit=config.repeatit,
                    predicted_column_index=predicted_column_index,
                )

            except Exception:
                mylogging.traceback(
                    f"Error in creating input type: {data_inputs_name} with option optimization: {optimization_value}",
                    level="WARNING",
                )
                continue

            for iterated_model_index, iterated_model_name in models_indexed.items():
                iterated_model = predictit.models.models_assignment[iterated_model_name]

                if config.models_input[iterated_model_name] == data_inputs_name:

                    predict_parameters = {
                        "config": config.get_dict(),
                        # Functions to not import all modules
                        "preprocess_data_inverse":
                        mdp.preprocessing.preprocess_data_inverse,
                        "fitted_power_transform":
                        mdp.preprocessing.fitted_power_transform,
                        # Other
                        "iterated_model_train": iterated_model.train,
                        "iterated_model_predict": iterated_model.predict,
                        "iterated_model_name": iterated_model_name,
                        "iterated_model_index": iterated_model_index,
                        "optimization_index": optimization_index,
                        "optimization_value": optimization_value,
                        "model_train_input": model_train_input,
                        "model_predict_input": model_predict_input,
                        "model_test_inputs": model_test_inputs,
                        "models_test_outputs": models_test_outputs,
                        "models_test_outputs_unstandardized":
                        models_test_outputs_unstandardized,
                        "data_abs_max": data_abs_max,
                        "data_mean": data_mean,
                        "data_std": data_std,
                        "last_undiff_value": last_undiff_value,
                        "final_scaler": final_scaler,
                        "semaphor": semaphor,
                    }

                    if config.models_input[iterated_model_name] in [
                            "one_step",
                            "one_step_constant",
                    ]:
                        if multicolumn and config.predicts > 1:
                            mylogging.warn(
                                f"Warning in model {iterated_model_name} \n\nOne-step prediction on "
                                "multivariate data (more columns). Use multi_step (y length equals predicts), "
                                "use some one-column data input in config models_input, or predict just one value."
                            )
                            continue

                    if config.multiprocessing == "process":

                        pipes.append(multiprocessing.Pipe(duplex=False))
                        p = multiprocessing.Process(
                            target=predictit._main_loop.train_and_predict,
                            kwargs={**predict_parameters, "pipe": pipes[-1][1]},
                        )

                        p.daemon = True  # Child process will be terminated if the parent is killed
                        p.start()

                    elif config.multiprocessing == "pool":

                        pool.apply_async(
                            predictit._main_loop.train_and_predict,
                            (),
                            predict_parameters,
                            callback=return_result,
                        )

                    else:
                        results = {
                            **results,
                            **predictit._main_loop.train_and_predict(**predict_parameters),
                        }

    if config.multiprocessing:
        if config.multiprocessing == "process":
            for i in pipes:
                try:
                    results = {**results, **i[0].recv()}
                except Exception:
                    pass

        if config.multiprocessing == "pool":
            pool.close()
            pool.join()

        for i in results.values():
            mylogging.my_logger.log_and_warn_from_lists(
                i["logs_list"], i["warnings_list"])

    # Create confidence intervals
    if config.confidence_interval:
        try:
            lower_bound, upper_bound = predictit.misc.confidence_interval(
                column_for_predictions_series.values,
                predicts=config.predicts,
                confidence=config.confidence_interval,
            )

            grey_area = ["Lower bound", "Upper bound"]
            bounds = True
        except Exception:
            bounds = False
            grey_area = ["Lower bound", "Upper bound"]
            mylogging.traceback("Error in compute confidence interval",
                                level="ERROR")

    else:
        bounds = False
        grey_area = False

    ###############
    ### ANCHOR ### Results processing
    #############

    # Criterion is the best of average from repetitions
    time_point = update_time_table(time_point)
    progress_phase = "Evaluation"
    update_gui(progress_phase, "progress_phase")

    # Two kind of results we will create. Both as dataframe
    #   - First are all the details around prediction. Model errors, time, memory peak etc.
    #   - Second we have predicted values

    # Results such as trained model etc. that cannot be displayed in dataframe are in original results dict.

    # Convert results from dictionary to dataframe - exclude objects like trained model

    results_df = pd.DataFrame.from_dict(results, orient="index")

    if results_df.empty:
        raise RuntimeError(
            mylogging.return_str(
                "None of models finished predictions. Set config.logger_level = 'DEBUG' for more info.",
                caption="All models failed for some reason",
            ))

    evaluated_matrix = np.zeros(
        (1, len(config.optimization_values), len(config.used_models)))
    evaluated_matrix.fill(np.nan)

    for k in results.values():
        evaluated_matrix[0, k["Index"][0], k["Index"][1]] = k["Model error"]

    (
        _,
        best_models_optimized_values,
        optimized_values_results_df,
        best_model_name,
        best_optimized_value,
    ) = predictit.analyze.analyze_results(
        evaluated_matrix,
        config.optimization_values,
        config.models.used_models,
        config.prediction.error_criterion,
    )

    # Generate date indexes for result predictions
    last_date = column_for_predictions_series.index[-1]

    if isinstance(
        last_date,
        (pd.core.indexes.datetimes.DatetimeIndex, pd._libs.tslibs.timestamps.Timestamp),
    ):
        date_index = pd.date_range(
            start=last_date,
            periods=config.predicts + 1,
            freq=column_for_predictions_series.index.freq,
        )[1:]
        date_index = pd.to_datetime(date_index)

    else:
        date_index = list(range(last_date + 1,
                                last_date + config.predicts + 1))

    predictions_df = pd.DataFrame(index=date_index)

    results_df.sort_values("Model error", inplace=True)

    for i, row in results_df.iterrows():
        predictions_df[i] = row["Results"]

    if predictions_df.empty:
        raise RuntimeError(
            mylogging.return_str(
                "Neither of models finished prediction. Set config.logger_level = 'DEBUG' for more info."
            ))

    if config.variable_optimization.optimization:
        best_optimized_values_dict = {
            j: best_models_optimized_values[i]
            for i, j in enumerate(config.used_models)
        }

        best_indexes = []

        for i, row in results_df.iterrows():
            if row["Optimization value"] == best_optimized_values_dict[
                    row["Name"]]:
                best_indexes.append(i)

        optimization_result = predictit._result_classes.Optimization(
            optimized_variable=config.variable_optimization.
            optimization_variable,
            optimized_options=config.variable_optimization.optimization_values,
            best_value=best_optimized_value,
            values_results_df=optimized_values_results_df,
            best_values_for_models=best_optimized_values_dict,
            all_models_results_df=results_df,
            all_models_predictions_df=predictions_df,
        )

        predictions_df = predictions_df[best_indexes]
        predictions_df.columns = [
            results_df.loc[i]["Name"] for i in predictions_df.columns
        ]
        results_df = results_df.loc[best_indexes]

        results_df.rename(columns={"A": "Col_1"}, inplace=True)

    else:
        optimization_result = None

    if config.hyperparameter_optimization.optimizeit:
        hyperparameter_optimization_kwargs = results_df[
            "Best optimized parameters"].to_dict()
    else:
        hyperparameter_optimization_kwargs = None

    results_df.set_index("Name", inplace=True)

    results_to_drop = [
        i for i in [
            "Index",
            "Trained model",
            "Test errors",
            "Results",
            "logs_list",
            "warnings_list",
        ] if i in results_df.columns
    ]
    results_df.drop(columns=results_to_drop, inplace=True)

    best_model_predicts = predictions_df[best_model_name]

    ###############
    ### ANCHOR ### Plot
    #############

    if config.variable_optimization.optimization and config.variable_optimization.plot_all_optimized_models:
        predictions_for_plot = optimization_result.all_models_predictions_df.copy()
    else:
        predictions_for_plot = predictions_df.copy()

    predictions_for_plot.columns = [
        f"{i + 1} - {j}" for i, j in enumerate(predictions_for_plot.columns)
    ]

    if config.mode == "validate":
        best_model_name_plot = "Test"
        predictions_df.insert(0, "Test", test_unstandardized)
        predictions_for_plot.insert(0, "Test", test_unstandardized)

    else:
        best_model_name_plot = predictions_for_plot.columns[0]

    bounds_df = pd.DataFrame(index=date_index)

    if bounds:
        bounds_df["Upper bound"] = upper_bound
        bounds_df["Lower bound"] = lower_bound

    last_value = float(column_for_predictions_series.iloc[-1, 0])

    predictions_for_plot_limited = pd.concat(
        [
            predictions_for_plot.iloc[:, :config.plot_number_of_models],
            bounds_df
        ],
        axis=1,
    )

    predictions_with_history = pd.concat(
        [
            column_for_predictions_series[-config.plot_history_length:],
            predictions_for_plot_limited,
        ],
        sort=False,
    )
    predictions_with_history.iloc[-config.predicts - 1, :] = last_value

    if config.sort_results_by == "name":
        results_df.sort_index(key=lambda x: x.str.lower(), inplace=True)
        predictions_df.sort_index(key=lambda x: x.str.lower(), inplace=True)

    if config.general.analyzeit:
        import matplotlib.pyplot as plt

        plt.show()

    time_point = update_time_table(time_point)
    progress_phase = "plot"
    update_gui(progress_phase, "progress_phase")

    if config.output.plot_subconfig.show_plot or config.output.plot_subconfig.save_plot:

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ResourceWarning)

            return_div = True if _GUI else False

            if config.plot_type == "with_history":
                div = plot(
                    predictions_with_history,
                    plot_library=config.plot_library,
                    plot_name=config.plot_name,
                    legend=config.plot_legend,
                    highlighted_column=predicted_column_name,
                    surrounded_column=best_model_name_plot,
                    grey_area=grey_area,
                    save=config.save_plot,
                    return_div=return_div,
                    show=config.output.plot_subconfig.show_plot,
                )

            elif config.plot_type == "just_results":
                div = plot(
                    predictions_for_plot,
                    plot_library=config.plot_library,
                    legend=config.plot_legend,
                    highlighted_column=best_model_name_plot,
                    save=config.save_plot,
                    show=config.output.plot_subconfig.show_plot,
                )

    update_time_table(time_point)
    progress_phase = "Completed"
    update_gui(progress_phase, "progress_phase")

    ###############
    ### ANCHOR ### Table
    #############

    time_df.append(["Complete time", round((time.time() - time_begin), 3)])
    time_df = pd.DataFrame(time_df, columns=["Part", "Time"])

    simple_table_df = mdp.misc.edit_table_to_printable(
        results_df[["Model error"]].iloc[:config.print_number_of_models, :].reset_index())

    detailed_table_df = results_df.iloc[:config.print_number_of_models, :].reset_index()
    detailed_table_df.drop(["Unstandardized model error"], axis=1, inplace=True)
    detailed_table_df = mdp.misc.edit_table_to_printable(detailed_table_df)

    tables = predictit._result_classes.Tables(
        simple=tabulate(
            simple_table_df.values,
            headers=["Model", f"Average {config.error_criterion} error"],
            **config.table_settings,
        ),
        detailed=tabulate(
            detailed_table_df.values,
            headers=detailed_table_df.columns,
            **config.table_settings,
        ),
        time=tabulate(time_df.values,
                      headers=time_df.columns,
                      **config.table_settings),
        simple_table_df=simple_table_df,
        detailed_table_df=detailed_table_df,
    )

    ###############
    ### ANCHOR ### Results
    #############

    misc_result = predictit._result_classes.Misc(
        evaluated_matrix=evaluated_matrix)
    result = predictit._result_classes.Result(
        best_prediction=best_model_predicts,
        best_model_name=best_model_name,
        predictions=predictions_df,
        results_df=results_df,
        results=results,
        with_history=predictions_with_history,
        tables=tables,
        config=config,
        misc=misc_result,
        optimization=optimization_result,
        hyperparameter_optimization_kwargs=hyperparameter_optimization_kwargs,
    )

    ###############
    ### ANCHOR ### Print
    #############

    if config.print_result_details:
        print((
            f"\nBest model is {best_model_name} with results \n\n{best_model_predicts}\n\nWith model error {config.error_criterion} = "
            f"{results_df.loc[best_model_name, 'Model error']}"))

    if config.print_table == "simple":
        print(f"\n{tables.simple}\n")

    elif config.print_table == "detailed":
        print(f"\n{tables.detailed}\n")

    if config.print_time_table:
        print(f"\n{tables.time}\n")

    ###############
    ### ANCHOR ### Return
    #############

    mylogging.reset_outer_warnings_filter()

    # Return stdout and stop collect warnings and printed output
    if _GUI:
        output = sys.stdout.getvalue()
        sys.stdout = stdout
        result.output = output
        print(output)

    if _GUI:
        result.plot = div

    if config.return_internal_results:
        return {
            "data_for_predictions (X, y)": data_for_predictions,
            "model_train_input": model_train_input,
            "model_predict_input": model_predict_input,
            "model_test_inputs": model_test_inputs,
            "models_test_outputs": models_test_outputs,
        }

    return result
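
A minimal usage sketch, assuming predict is exported at the package level; the random series only illustrates the call signature documented above:

import numpy as np
import predictit

# Univariate series; predicted_column defaults to the first column, predicts is passed through kwargs to config.
result = predictit.predict(np.random.randn(500), predicts=7)

print(result.best_model_name)
print(result.best_prediction)
print(result.tables.simple)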
Example #11
def optimize(
    model_train: Callable,
    model_predict: Callable,
    kwargs: dict[str, Any],
    kwargs_limits: dict[str, Any],
    model_train_input: Inputs,
    model_test_inputs: list | np.ndarray,
    models_test_outputs: np.ndarray,
    error_criterion: str = "mape",
    fragments: int = 10,
    iterations: int = 3,
    details: int = 0,
    time_limit: int | float = 5,
    name: str = "Your model",
    plot: bool = False,
) -> None | dict[str, Any]:
    """Function to find optimal parameters of function. For example if we want to find minimum of function x^2,
    we can use limits from -10 to 10. If we have 4 fragments and 3 iterations. it will separate interval on 4 parts,
    so we have approximately points -10, -4, 4, 10. We evaluate the best one and make new interval to closest points,
    so new interval will ber -4 and 4. We divide again into 4 points. We repeat as many times as iterations variable
    defined.

    Note: If limits are written as int, it will be used only as int, so if you want to use float, write -10.0,
    10.0 etc... If you want to define concrete values to be evaluated, just use list of more than 2 values (also you
    can use strings).

    If we have many arguments, it will create many combinations of parameters, so beware, it can be very
    computationally intensive...

    Args:
        model_train (Callable): Model train function (eg: ridgeregression.train).
        model_predict (Callable): Model predict function (eg: ridgeregression.predict).
        kwargs (dict[str, Any]): Initial arguments (eg: {"alpha": 0.1, "n_steps_in": 10}).
        kwargs_limits (dict[str, Any]): Bounds of arguments (eg: {"alpha": [0.1, 1], "n_steps_in":[2, 30]}).
        model_train_input (Inputs): Data on which function is
            optimized. Use train data or sequences (tuple with (X, y, x_input)) - depends on model. Defaults to None.
        model_test_inputs (list | np.ndarray): Test inputs on which the error criterion is evaluated to
            be able to compare results. It has to be out-of-sample data, i.e. data from the test set.
        models_test_outputs (np.ndarray): Test set outputs.
        error_criterion (str, optional): Error criterion used in evaluation. 'rmse' or 'mape'. Defaults to 'mape'.
        fragments (int, optional): Number of optimized intervals. Defaults to 10.
        iterations (int, optional): How many times the initial interval will be divided into fragments. Defaults to 3.
        details (int, optional): 0 prints nothing, 1 prints the best parameters of models, 2 prints every new best parameters
            achieved, 3 prints all results. Values bigger than 0 print the percentage of progress. Defaults to 0.
        time_limit (int | float, optional): How many seconds can one evaluation last. Defaults to 5.
        name (str, optional): Name of model to be displayed in details. Defaults to 'your model'.
        plot (bool, optional): It's possible to plot all parameter combinations to analyze their influence.
            Defaults to False.

    Returns:
        dict: Optimized parameters of model.

    """

    kwargs_fragments = {}
    constant_kwargs = ({
        key: value
        for (key, value) in kwargs.items() if key not in kwargs_limits
    } if kwargs else {})
    kwargs = {
        key: value
        for (key, value) in kwargs.items() if key not in constant_kwargs
    } if kwargs else {}

    n_test_samples = models_test_outputs.shape[0]
    predicts = models_test_outputs.shape[1]
    last_best_params = {}
    last_printed_time = time.time()

    def evaluatemodel(model_kwargs):
        """Evaluate error function for optimize function.

        Args:
            model_kwargs (dict): Arguments of model

        Returns:
            float: MAPE or RMSE depends on optimize function argument

        """

        modeleval = np.zeros(n_test_samples)

        try:
            trained_model = model_train(model_train_input, **constant_kwargs,
                                        **model_kwargs)

            for repeat_iteration in range(n_test_samples):

                create_plot = plot and repeat_iteration == n_test_samples - 1

                predictions = model_predict(
                    model_test_inputs[repeat_iteration],
                    trained_model,
                    predicts=predicts,
                )
                modeleval[repeat_iteration] = evaluate_predictions.compare_predicted_to_test(
                    predictions,
                    models_test_outputs[repeat_iteration],
                    error_criterion=error_criterion,
                    model_name=f"{name} - {model_kwargs}",
                    plot=create_plot,
                )

            return np.mean(modeleval)

        except Exception:
            return np.inf

    # Test default parameters (can be the best)
    best_result = evaluatemodel(kwargs)

    if best_result != np.inf:
        best_params = kwargs
    else:
        best_params = {}

    if details > 0:
        print(
            f"\n\nOptimization of model {name}:\n\n  Default parameters result: {best_result}\n"
        )

    # If result isn't better during iteration, return results
    memory_result = 0

    all_combinations = []

    for i, j in kwargs_limits.items():

        if not isinstance(j[0], (int, float, np.ndarray)) or len(j) != 2:
            kwargs_fragments[i] = j
        elif isinstance(j[0], int):
            help_var = np.linspace(j[0], j[1], fragments, dtype=int)
            kwargs_fragments[i] = list(set([int(round(j)) for j in help_var]))
        else:
            kwargs_fragments[i] = np.unique(np.linspace(j[0], j[1], fragments))

    for iteration in range(iterations):
        if details > 0:
            print(f"    Iteration {iteration + 1} / {iterations} results: \n")

        combinations = list(itertools.product(*kwargs_fragments.values()))

        combi_len = len(combinations)
        percent = round(combi_len / 100, 1)

        list_of_combinations = []
        for j in combinations:
            combination_dict = {
                key: value
                for (key, value) in zip(kwargs_limits.keys(), j)
            }
            list_of_combinations.append(combination_dict)

        counter = 0
        for k, combination in enumerate(combinations):
            counter += 1

            if combination in all_combinations:
                continue

            all_combinations.append(combination)

            try:
                if time_limit:
                    res = watchdog(time_limit, evaluatemodel,
                                   list_of_combinations[k])
                else:
                    res = evaluatemodel(list_of_combinations[k])

                if res is not None and not np.isnan(res) and res < best_result:
                    best_result = res
                    best_params = list_of_combinations[k]

                    if details == 2:
                        print(
                            f"\n  New best result {best_result} with parameters: \t {best_params}\n"
                        )

            except Exception:
                if details > 0:
                    mylogging.traceback(
                        f"Error on model {name}: with params {list_of_combinations[k]}"
                    )
                res = np.nan

            finally:

                if details == 3:
                    print(
                        f"    {res}  with parameters:  {list_of_combinations[k]}"
                    )

                if (details > 0 and percent > 0 and counter % 10 == 1
                        and time.time() - last_printed_time > 3):
                    print(f"\tOptimization is at {int(counter / percent)} %")
                    last_printed_time = time.time()

        if last_best_params != best_params and (
                memory_result != 0 and (memory_result - best_result) < 10e-6):
            if details > 0:
                print(
                    f"  Optimization stopped because it converged. "
                    f"Best result {best_result} with parameters {best_params}")
            return best_params

        # If this is the last iteration, do not create new intervals
        elif iteration + 1 == iterations:
            if details > 0:
                print(
                    f"  Optimization finished. Best result {best_result} with parameters {best_params}"
                )
            return best_params

        # None of the parameter combinations finished
        elif not best_params:
            if details > 0:
                print(
                    "  Optimization failed. None of the parameter combinations finished."
                )
            return best_params

        memory_result = best_result
        last_best_params = best_params

        # Build a narrower grid around the best parameters for the next iteration
        # (a standalone sketch of this refinement step follows this example)
        for i, j in kwargs_limits.items():

            # Discrete (non-numeric) options are fixed to the best value found
            if not isinstance(j[0], (int, float, np.ndarray)) or len(j) != 2:
                kwargs_fragments[i] = [best_params[i]]
            else:
                step = (max(kwargs_fragments[i]) -
                        min(kwargs_fragments[i])) / fragments
                if best_params[i] - step < j[0]:
                    kwargs_fragments[i] = np.linspace(best_params[i],
                                                      best_params[i] + step,
                                                      fragments)
                elif best_params[i] + step > j[1]:
                    kwargs_fragments[i] = np.linspace(best_params[i] - step,
                                                      best_params[i],
                                                      fragments)
                else:
                    kwargs_fragments[i] = np.linspace(best_params[i] - step,
                                                      best_params[i] + step,
                                                      fragments)
                kwargs_fragments[i] = np.unique(kwargs_fragments[i])
                if isinstance(j[0], int):
                    kwargs_fragments[i] = list(
                        set([int(round(k)) for k in kwargs_fragments[i]]))
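

# --- Illustrative sketch (not part of the original example) ---
# A minimal, self-contained illustration of the interval-refinement idea used above:
# each iteration rebuilds a linspace grid around the current best value with a smaller
# step, clipped to the original limits. The names `refine_grid` and `limit` are made up
# for this sketch only.
import numpy as np


def refine_grid(best_value, current_grid, limit, fragments=4):
    """Return a narrower grid centered on best_value, clipped to limit=(low, high)."""
    step = (max(current_grid) - min(current_grid)) / fragments
    if best_value - step < limit[0]:
        new_grid = np.linspace(best_value, best_value + step, fragments)
    elif best_value + step > limit[1]:
        new_grid = np.linspace(best_value - step, best_value, fragments)
    else:
        new_grid = np.linspace(best_value - step, best_value + step, fragments)
    return np.unique(new_grid)


grid = np.linspace(0, 100, 4)  # first iteration searches the full (0, 100) interval
grid = refine_grid(60.0, grid, (0, 100))  # next iteration searches only around 60
print(grid)  # approximately [35. 51.67 68.33 85.]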
Example #12
0
def train_and_predict(
    config,
    # Functions to not import all modules
    preprocess_data_inverse,
    fitted_power_transform,
    # Other
    iterated_model_train,
    iterated_model_predict,
    iterated_model_name,
    iterated_model_index,
    optimization_index,
    optimization_value,
    model_train_input,
    model_predict_input,
    model_test_inputs,
    models_test_outputs,
    models_test_outputs_unstandardized,
    data_abs_max,
    data_mean,
    data_std,
    last_undiff_value=None,
    final_scaler=None,
    pipe=None,
    semaphor=None,
) -> None | dict[str, Any]:
    """Inner function, that can run in parallel with multiprocessing.

    Note:
        config is just a dictionary passed as param, so cannot use dot syntax here.

    Args:
        Some values from predictit configuration.

    Returns:
        None | dict[str, Any]: Return dict of results or send data via multiprocessing.
    """

    logs_list = []
    warnings_list = []

    if config["multiprocessing"]:
        mylogging._misc.filter_warnings()
        mylogging.outer_warnings_filter(config["ignored_warnings"],
                                        config["ignored_warnings_class_type"])
        mylogging.config.BLACKLIST = config["ignored_warnings"]
        mylogging.config.OUTPUT = config["logger_output"]
        mylogging.config.LEVEL = config["logger_level"]
        mylogging.config.FILTER = config["logger_filter"]
        mylogging.config.COLORIZE = config["logger_color"]
        logs_redirect = mylogging.redirect_logs_and_warnings_to_lists(
            logs_list, warnings_list)

    if config["is_tested"]:
        import mypythontools

        mypythontools.tests.setup_tests(matplotlib_test_backend=True)

    if semaphor:
        semaphor.acquire()

    if config["trace_processes_memory"]:
        import tracemalloc

        tracemalloc.start()

    model_results = {"Name": iterated_model_name}

    result_name = (f"{iterated_model_name} - {optimization_value}"
                   if config["optimization"] else f"{iterated_model_name}")

    if (config["optimizeit"] and optimization_index == 0
            and iterated_model_name in config["models_parameters_limits"]):

        start_optimization = time.time()

        try:
            model_results[
                "Best optimized parameters"] = predictit.best_params.optimize(
                    iterated_model_train,
                    iterated_model_predict,
                    config["models_parameters"].get(iterated_model_name),
                    config["models_parameters_limits"][iterated_model_name],
                    model_train_input=model_train_input,
                    model_test_inputs=model_test_inputs,
                    models_test_outputs=models_test_outputs,
                    time_limit=config["optimizeit_limit"],
                    error_criterion=config["error_criterion"],
                    name=iterated_model_name,
                    iterations=config["iterations"],
                    fragments=config["fragments"],
                    details=config["optimizeit_details"],
                    plot=config["optimizeit_plot"],
                )

        except TimeoutError:
            model_results["Best optimized parameters"] = {}
            mylogging.traceback(
                f"Hyperparameters optimization of {iterated_model_name} didn't finish"
            )

        for param_name, param_value in model_results["Best optimized parameters"].items():

            if iterated_model_name not in config["models_parameters"]:
                config["models_parameters"][iterated_model_name] = {}
            config["models_parameters"][iterated_model_name][param_name] = param_value

        stop_optimization = time.time()
        model_results[
            "Hyperparameter optimization time"] = stop_optimization - start_optimization

    start = time.time()

    try:

        # If there are no parameters for this model, add an empty dict so there are no
        # key errors later
        if iterated_model_name not in config["models_parameters"]:
            config["models_parameters"][iterated_model_name] = {}

        # Train all models
        trained_model = iterated_model_train(
            model_train_input,
            **config["models_parameters"][iterated_model_name])

        # Create predictions - out of sample
        one_reality_result = iterated_model_predict(model_predict_input,
                                                    trained_model,
                                                    config["predicts"])

        if one_reality_result is None or np.isnan(np.sum(one_reality_result)):
            raise ValueError("NaN predicted from model.")

        # Remove wrong values out of scope to not be plotted
        one_reality_result[abs(one_reality_result) > 3 * data_abs_max] = np.nan

        # Do inverse data preprocessing
        if config["power_transformed"]:
            one_reality_result = fitted_power_transform(
                one_reality_result, data_std, data_mean)

        one_reality_result = preprocess_data_inverse(
            one_reality_result,
            final_scaler=final_scaler,
            last_undiff_value=last_undiff_value,
            standardizeit=config["standardizeit"],
            data_transform=config["data_transform"],
        )

        tests_results = np.zeros((config["repeatit"], config["predicts"]))
        test_errors_unstandardized = np.zeros(
            (config["repeatit"], config["predicts"]))
        test_errors = np.zeros(config["repeatit"])

        # Predict on the test inputs to evaluate which models are best - do not inverse the
        # data preprocessing here, because the test data are already preprocessed
        for repeat_iteration in range(config["repeatit"]):

            # Create in-sample predictions to evaluate if model is good or not
            tests_results[repeat_iteration] = iterated_model_predict(
                model_test_inputs[repeat_iteration],
                trained_model,
                predicts=config["predicts"],
            )

            if config["power_transformed"]:
                tests_results[repeat_iteration] = fitted_power_transform(
                    tests_results[repeat_iteration], data_std, data_mean)

            test_errors[
                repeat_iteration] = predictit.evaluate_predictions.compare_predicted_to_test(
                    tests_results[repeat_iteration],
                    models_test_outputs[repeat_iteration],
                    error_criterion=config["error_criterion"],
                )

            tests_results[repeat_iteration] = preprocess_data_inverse(
                tests_results[repeat_iteration],
                final_scaler=final_scaler,
                last_undiff_value=last_undiff_value,
                standardizeit=config["standardizeit"],
                data_transform=config["data_transform"],
            )

            test_errors_unstandardized[
                repeat_iteration] = predictit.evaluate_predictions.compare_predicted_to_test(
                    tests_results[repeat_iteration],
                    models_test_outputs_unstandardized[repeat_iteration],
                    error_criterion=config["error_criterion"],
                )

        model_results["Model error"] = test_errors.mean()
        model_results["Unstandardized model error"] = test_errors.mean()
        model_results["Results"] = one_reality_result
        model_results["Test errors"] = test_errors

        # For example TensorFlow models are not picklable, so sending the model from a
        # process would fail. Keep the trained model only when not using multiprocessing.
        if not config["multiprocessing"]:
            model_results["Trained model"] = trained_model

    except Exception:
        results_array = np.zeros(config["predicts"])
        results_array.fill(np.nan)
        test_errors = np.zeros((config["repeatit"], config["predicts"]))
        test_errors.fill(np.nan)

        model_results["Model error"] = np.inf
        model_results["Unstandardized model error"] = np.inf
        model_results["Results"] = results_array
        model_results["Test errors"] = test_errors
        error_message = (
            f"Error in '{result_name}' model"
            if not config["optimization"] else
            f"Error in {iterated_model_name} model with optimized value: {optimization_value}"
        )

        mylogging.traceback(caption=error_message)

    finally:
        model_results["Index"] = (optimization_index, iterated_model_index)
        model_results["warnings_list"] = warnings_list
        model_results["logs_list"] = logs_list
        model_results["Model time [s]"] = time.time() - start

        if config["optimization_variable"]:
            model_results["Optimization value"] = optimization_value

        if config["trace_processes_memory"]:
            _, memory_peak_MB = tracemalloc.get_traced_memory()
            model_results["Memory Peak\n[MB]"] = memory_peak_MB / 10**6
            tracemalloc.stop()

        if config["multiprocessing"]:
            logs_redirect.close_redirect()

        if semaphor:
            semaphor.release()

        if config["multiprocessing"] == "process":
            pipe.send({f"{result_name}": model_results})
            pipe.close()

        else:
            return {f"{result_name}": model_results}
Example #13
0
def analyze_column(data: np.ndarray | pd.DataFrame,
                   lags: int = 5,
                   window: int = 5) -> None:
    """Function one-dimensional data (predicted column), that plot data, it's distribution, some details like minimum,
    maximum, std, mean etc. It also create autocorrelation and partial autocorrelation (good for ARIMA models) and
    plot rolling mean and rolling std. It also tell if data are probably stationary or not.

    Args:
        data (np.ndarray | pd.DataFrame): Time series data.
        lags (int, optional): Lags used for autocorrelation. Defaults to 5.
        window (int, optional): Window for rolling average and rolling std. Defaults to 5.

    """
    if not misc.GLOBAL_VARS.PLOTS_CONFIGURED:
        misc.setup_plots()

    import matplotlib.pyplot as plt
    import seaborn as sns
    from statsmodels.graphics.tsaplots import plot_acf
    from statsmodels.graphics.tsaplots import plot_pacf
    from statsmodels.tsa.stattools import adfuller

    import mydatapreprocessing

    data = np.array(data)

    if data.ndim != 1 and 1 not in data.shape:
        raise ValueError(
            mylogging.return_str(
                "Select column you want to analyze",
                caption="analyze_data function only for one-dimensional data!",
            ))

    data = data.ravel()

    print(
        f"Length: {len(data)}\n"
        f"Minimum: {np.nanmin(data)}\n"
        f"Maximum: {np.nanmax(data)}\n"
        f"Mean: {np.nanmean(data)}\n"
        f"Std: {np.nanstd(data)}\n"
        f"First few values: {data[-5:]}\n"
        f"Middle values: {data[int(-len(data)/2): int(-len(data)/2) + 5]}\n"
        f"Last few values: {data[-5:]}\n"
        f"Number of nan (not a number) values: {np.count_nonzero(np.isnan(data))}\n"
    )

    # Data and its distribution

    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(data)
    plt.xlabel("t")
    plt.ylabel("f(x)")

    plt.subplot(1, 2, 2)
    sns.histplot(data, bins=100, kde=True, color="skyblue")
    plt.xlabel("f(x)")
    plt.ylabel("Distribution")

    plt.tight_layout()
    plt.suptitle("Data and it's distribution", fontsize=20)
    plt.subplots_adjust(top=0.88)
    plt.draw()

    fig, (ax, ax2) = plt.subplots(ncols=2, figsize=(10, 5))
    fig.suptitle("Repeating patterns - autocorrelation")

    try:

        plot_acf(data, lags=lags, ax=ax)
        ax.set_xlabel("Lag")
        plot_pacf(data, lags=lags, ax=ax2)
        ax2.set_xlabel("Lag")
        plt.draw()

    except Exception:
        mylogging.traceback(
            "Error in analyze_column function - in autocorrelation function: Maybe more lags, than values"
        )

    # Moving average
    rolling_mean = np.mean(
        mydatapreprocessing.misc.rolling_windows(data, window), 1)
    rolling_std = np.std(
        mydatapreprocessing.misc.rolling_windows(data, window), 1)

    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(rolling_mean)
    plt.xlabel("t")
    plt.ylabel("Rolling average x")

    plt.subplot(1, 2, 2)
    plt.plot(rolling_std)
    plt.xlabel("f(x)")
    plt.ylabel("Rolling standard deviation x")

    plt.tight_layout()
    plt.suptitle("Rolling average and rolling standard deviation", fontsize=20)
    plt.subplots_adjust(top=0.88)
    plt.draw()

    # Augmented Dickey-Fuller test for stationarity - the null hypothesis is that a unit
    # root is present (non-stationary data); a p-value below the cutoff rejects it
    pvalue = adfuller(data)[1]
    cutoff = 0.05
    if pvalue < cutoff:
        print(
            f"\np-value = {pvalue} : Analyzed column is probably stationary.\n"
        )
    else:
        print(
            f"\np-value = {pvalue} : Analyzed column is probably not stationary.\n"
        )
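

# --- Illustrative usage sketch (not part of the original example) ---
# A quick smoke test of analyze_column on synthetic seasonal data with noise; it assumes
# the same dependencies analyze_column itself needs (matplotlib, seaborn, statsmodels,
# mydatapreprocessing) are installed.
import numpy as np

synthetic = np.sin(np.linspace(0, 20 * np.pi, 1000)) + np.random.normal(0, 0.2, 1000)
analyze_column(synthetic, lags=30, window=20)  # prints summary stats and draws the plots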
Example #14
0
def test_warnings():

    mylogging.config.TO_FILE = False

    errors = []

    def get_stdout(func, args=(), kwargs=None, loop=1):
        # A hand-rolled stdout capture; see the contextlib.redirect_stdout sketch after
        # this test for the standard-library equivalent
        kwargs = kwargs or {}

        stdout = sys.stdout
        sys.stdout = io.StringIO()

        for _ in range(loop):
            func(*args, **kwargs)

        output = sys.stdout.getvalue()
        sys.stdout = stdout

        return output

    ################
    ### Debug = 0 - show nothing
    ################

    with warnings.catch_warnings(record=True) as w:
        mylogging.set_warnings(debug=0)

        if get_stdout(mylogging.info, ["Hello"]):
            errors.append("Info printed, but should not.")

        mylogging.warn("asdasd")

        try:
            print(10 / 0)

        except Exception:
            mylogging.traceback("Maybe try to use something different than 0")

        if w:
            errors.append("Warn, but should not.")

    ################
    ### Debug = 1 - show once
    ################

    with warnings.catch_warnings(record=True) as w:
        mylogging.set_warnings(debug=1)

        output = get_stdout(mylogging.info, ["Hello"], loop=2)

        if not output:
            errors.append("Info not printed, but should.")

        output_lines_count = len(output.splitlines())

        mylogging.warn("asdasd")
        mylogging.warn("dva")
        mylogging.warn("asdasd")

        try:
            print(10 / 0)

        except Exception:
            mylogging.traceback("Maybe try to use something different than 0")

        if len(w) != 3:
            errors.append("Doesn't warn once.")

    ################
    ### Debug = 2 - show always
    ################

    with warnings.catch_warnings(record=True) as w:
        mylogging.set_warnings(debug=2)

        mylogging.warn("asdasd")
        mylogging.warn("asdasd")

        if len(w) != 2:
            errors.append("Doesn'twarn always.")

        output_always = get_stdout(mylogging.info, ["Hello"], loop=2)
        output_lines_count_always = len(output_always.splitlines())

        if output_lines_count_always <= output_lines_count:
            errors.append("Info not printed always as it should be, or vice versa.")

    ################
    ### Debug = 3 - Stop on error
    ################

    mylogging.set_warnings(debug=3)

    try:
        errors.append("Not stopped on runtime warning.")
        mylogging.warn("asdasd")
    except Exception:
        errors.pop()

    try:
        errors.append("Not stopped on traceback warning")

        try:
            print(10 / 0)

        except Exception:
            mylogging.traceback("Maybe try to use something different than 0")

    except Exception:
        errors.pop()

    # Test outer file
    with warnings.catch_warnings(record=True) as w:
        mylogging.set_warnings(1)

        if not get_stdout(info_outside, ["Message"]):
            errors.append("Outside info not working")

        warn_outside("Message")
        traceback_outside("Message")

        if len(w) != 2:
            errors.append("Warn from other file not working")

    assert not errors
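

# --- Illustrative sketch (not part of the original example) ---
# The get_stdout helper above swaps sys.stdout by hand; the standard-library equivalent
# is contextlib.redirect_stdout, sketched here for comparison.
import contextlib
import io


def capture_stdout(func, *args, **kwargs):
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        func(*args, **kwargs)
    return buffer.getvalue()


print(repr(capture_stdout(print, "Hello")))  # -> 'Hello\n'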