def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # the search of the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(
        task_type=task.task_type)

    metric_function = MetricsRepository(). \
        metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer that allows finding
    # the optimal structure of the composite model
    composer = GPComposer()

    # run the search for the best suitable model
    chain_evo_composed = composer.compose_chain(
        data=dataset_to_compose,
        initial_chain=None,
        composer_requirements=composer_requirements,
        metrics=metric_function,
        is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
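# A minimal usage sketch for get_model, assuming the FEDOT classes referenced
# above are importable and that 'train.csv'/'test.csv' are hypothetical paths
# with the id/features/target column layout expected by InputData.from_csv.
def example_get_model_usage():
    chain = get_model('train.csv', cur_lead_time=timedelta(minutes=2))
    test_data = InputData.from_csv('test.csv',
                                   task=Task(TaskTypesEnum.classification))
    # as elsewhere in these snippets, Chain.predict returns an OutputData
    # object whose raw values are stored in the .predict attribute
    return chain.predict(test_data).predict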
def test_dummy_composer_flat_chain_build_correct():
    composer = DummyComposer(DummyChainTypeEnum.flat)
    empty_data = InputData(idx=np.zeros(1),
                           features=np.zeros(1),
                           target=np.zeros(1),
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)
    primary = ['logit']
    secondary = ['logit', 'xgboost']
    composer_requirements = ComposerRequirements(primary=primary,
                                                 secondary=secondary)

    new_chain = composer.compose_chain(data=empty_data,
                                       initial_chain=None,
                                       composer_requirements=composer_requirements,
                                       metrics=None)

    assert len(new_chain.nodes) == 3
    assert isinstance(new_chain.nodes[0], PrimaryNode)
    assert isinstance(new_chain.nodes[1], SecondaryNode)
    assert isinstance(new_chain.nodes[2], SecondaryNode)
    assert new_chain.nodes[1].nodes_from[0] is new_chain.nodes[0]
    assert new_chain.nodes[2].nodes_from[0] is new_chain.nodes[1]
    assert new_chain.nodes[0].nodes_from is None
def data_setup():
    predictors, response = load_breast_cancer(return_X_y=True)
    np.random.seed(1)
    # shuffle features and labels with a single permutation so that
    # each sample keeps its own label (independent shuffles would
    # destroy the feature-target correspondence)
    permutation = np.random.permutation(len(response))
    predictors = predictors[permutation][:100]
    response = response[permutation][:100]
    train_data_x, test_data_x = split_train_test(predictors)
    train_data_y, test_data_y = split_train_test(response)
    train_data = InputData(features=train_data_x, target=train_data_y,
                           idx=np.arange(0, len(train_data_y)),
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)
    test_data = InputData(features=test_data_x, target=test_data_y,
                          idx=np.arange(0, len(test_data_y)),
                          task=Task(TaskTypesEnum.classification),
                          data_type=DataTypesEnum.table)
    return train_data, test_data
def chain_with_incorrect_task_type():
    first = PrimaryNode(model_type='linear')
    second = PrimaryNode(model_type='linear')
    final = SecondaryNode(model_type='kmeans',
                          nodes_from=[first, second])
    chain = Chain(final)

    return chain, Task(TaskTypesEnum.classification)
def get_iris_data() -> InputData:
    # the Iris dataset is a real dataset, so avoid the misleading
    # `synthetic_data` name
    iris = load_iris()
    input_data = InputData(idx=np.arange(0, len(iris.target)),
                           features=iris.data,
                           target=iris.target,
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)
    return input_data
def run_tpot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    models_hyperparameters = get_models_hyperparameters()['TPOT']
    generations = models_hyperparameters['GENERATIONS']
    population_size = models_hyperparameters['POPULATION_SIZE']

    result_model_filename = f'{case_label}_g{generations}' \
                            f'_p{population_size}_{task.name}.pkl'
    current_file_path = str(os.path.dirname(__file__))
    result_file_path = os.path.join(current_file_path, result_model_filename)

    train_data = InputData.from_csv(train_file_path, task=Task(task))

    if result_model_filename not in os.listdir(current_file_path):
        # TODO change hyperparameters to actual from variable
        model = fit_tpot(train_data,
                         models_hyperparameters['MAX_RUNTIME_MINS'])

        model.export(output_file_name=f'{result_model_filename[:-4]}_pipeline.py')

        # sklearn pipeline object
        fitted_model_config = model.fitted_pipeline_
        joblib.dump(fitted_model_config, result_file_path, compress=1)

    imported_model = joblib.load(result_file_path)

    predict_data = InputData.from_csv(test_file_path, task=Task(task))
    true_target = predict_data.target
    if task == TaskTypesEnum.regression:
        predicted = predict_tpot_reg(imported_model, predict_data)
    elif task == TaskTypesEnum.classification:
        predicted = predict_tpot_class(imported_model, predict_data)
    else:
        print('Unsupported ML task type')
        raise NotImplementedError()

    print(f'BEST_model: {imported_model}')

    return true_target, predicted
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               is_visualise=False):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # the search of the models provided by the framework that can be used as
    # nodes in a chain for the selected task
    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=3, max_depth=3, pop_size=20, num_of_generations=20,
        crossover_prob=0.8, mutation_prob=0.8, max_lead_time=max_lead_time)

    # Create GP-based composer
    composer = GPComposer()

    # the optimal chain generation by composition - the most time-consuming task
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                initial_chain=None,
                                                composer_requirements=composer_requirements,
                                                metrics=metric_function,
                                                is_visualise=False)

    chain_evo_composed.fine_tune_primary_nodes(input_data=dataset_to_compose,
                                               iterations=50)
    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    if is_visualise:
        ComposerVisualiser.visualise(chain_evo_composed)

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(chain_evo_composed,
                                                            dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
def data_setup() -> InputData:
    predictors, response = load_iris(return_X_y=True)
    np.random.seed(1)
    # use one permutation for features and labels to keep them aligned
    permutation = np.random.permutation(len(response))
    predictors = predictors[permutation][:100]
    response = response[permutation][:100]
    data = InputData(features=predictors, target=response,
                     idx=np.arange(0, 100),
                     task=Task(TaskTypesEnum.classification),
                     data_type=DataTypesEnum.table)
    return data
def get_synthetic_input_data(n_samples=10000, n_features=10,
                             random_state=None) -> InputData:
    synthetic_data = make_classification(n_samples=n_samples,
                                         n_features=n_features,
                                         random_state=random_state)
    input_data = InputData(idx=np.arange(0, len(synthetic_data[1])),
                           features=synthetic_data[0],
                           target=synthetic_data[1],
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)
    return input_data
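# A hedged sketch of feeding the synthetic data above into a single-model
# chain; 'logit' is assumed to be a valid classification model type, as in
# the primary/secondary lists used elsewhere in these snippets.
def example_fit_on_synthetic_data():
    data = get_synthetic_input_data(n_samples=1000, n_features=10,
                                    random_state=42)
    chain = Chain(PrimaryNode('logit'))
    chain.fit(input_data=data)
    return chain.predict(data).predict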
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=64,
                                     is_visualise=False):
    # specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    chain = get_composite_lstm_chain()

    chain_simple = Chain()
    node_single = PrimaryNode('ridge')
    chain_simple.add_node(node_single)

    chain_lstm = Chain()
    node_lstm = PrimaryNode('lstm')
    chain_lstm.add_node(node_lstm)

    chain.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid = calculate_validation_metric(
        chain.predict(dataset_to_validate), dataset_to_validate,
        f'full-composite_{forecast_length}', is_visualise)

    chain_lstm.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_lstm_only = calculate_validation_metric(
        chain_lstm.predict(dataset_to_validate), dataset_to_validate,
        f'full-lstm-only_{forecast_length}', is_visualise)

    chain_simple.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_simple = calculate_validation_metric(
        chain_simple.predict(dataset_to_validate), dataset_to_validate,
        f'full-simple_{forecast_length}', is_visualise)

    print(f'RMSE composite: {rmse_on_valid}')
    print(f'RMSE simple: {rmse_on_valid_simple}')
    print(f'RMSE LSTM only: {rmse_on_valid_lstm_only}')

    return rmse_on_valid_simple
def classification_dataset_with_redundant_features(n_samples=1000,
                                                   n_features=100,
                                                   n_informative=5) -> InputData:
    synthetic_data = make_classification(n_samples=n_samples,
                                         n_features=n_features,
                                         n_informative=n_informative)
    input_data = InputData(idx=np.arange(0, len(synthetic_data[1])),
                           features=synthetic_data[0],
                           target=synthetic_data[1],
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)
    return input_data
def classification_dataset():
    samples = 1000
    x = 10.0 * np.random.rand(samples, ) - 5.0
    x = np.expand_dims(x, axis=1)
    y = 1.0 / (1.0 + np.exp(np.power(x, -1.0)))
    threshold = 0.5
    classes = np.array([0.0 if val <= threshold else 1.0 for val in y])
    classes = np.expand_dims(classes, axis=1)
    data = InputData(features=x, target=classes,
                     idx=np.arange(0, len(x)),
                     task=Task(TaskTypesEnum.classification),
                     data_type=DataTypesEnum.table)
    return data
def get_synthetic_ts_data(n_steps=10000) -> InputData:
    simulated_data = ArmaProcess().generate_sample(nsample=n_steps)
    x1 = np.arange(0, n_steps)
    x2 = np.arange(0, n_steps) + 1

    simulated_data = simulated_data + x1 * 0.0005 - x2 * 0.0001

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=1, max_window_size=2))

    input_data = InputData(idx=np.arange(0, n_steps),
                           features=np.asarray([x1, x2]).T,
                           target=simulated_data,
                           task=task,
                           data_type=DataTypesEnum.ts)
    return input_data
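# A quick shape check for the synthetic series above (a sketch: with no AR/MA
# terms, statsmodels' ArmaProcess yields white noise, onto which the linear
# trend is added):
def example_synthetic_ts_shapes():
    ts = get_synthetic_ts_data(n_steps=100)
    assert ts.features.shape == (100, 2)  # the two exogenous columns x1, x2
    assert ts.target.shape == (100,)      # one target value per timestamp
    return ts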
def output_dataset():
    task = Task(TaskTypesEnum.classification)

    samples = 1000
    x = 10.0 * np.random.rand(samples, ) - 5.0
    x = np.expand_dims(x, axis=1)
    threshold = 0.5
    y = 1.0 / (1.0 + np.exp(np.power(x, -1.0)))
    classes = np.array([0.0 if val <= threshold else 1.0 for val in y])
    classes = np.expand_dims(classes, axis=1)
    # the index must cover every sample, not a fixed 100 entries
    data = OutputData(idx=np.arange(0, samples),
                      features=x, predict=classes,
                      task=task, data_type=DataTypesEnum.table)
    return data
def test_data_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/simple_classification.csv'
    task = Task(TaskTypesEnum.classification)
    df = pd.read_csv(os.path.join(test_file_path, file))
    data_array = np.array(df).T
    features = data_array[1:-1].T
    target = data_array[-1]
    idx = data_array[0]
    expected_features = InputData(features=features, target=target,
                                  idx=idx, task=task,
                                  data_type=DataTypesEnum.table).features
    actual_features = InputData.from_csv(
        os.path.join(test_file_path, file)).features
    # compare the full arrays rather than the truthiness of .all(),
    # which would pass even for differing arrays
    assert np.array_equal(expected_features, actual_features)
def fit_template(chain_template, classes, with_gaussian=False, skip_fit=False):
    templates_by_models = []
    for model_template in itertools.chain.from_iterable(chain_template):
        model_instance = Model(model_type=model_template.model_type)
        model_template.model_instance = model_instance
        templates_by_models.append((model_template, model_instance))
    if skip_fit:
        return

    for template, instance in templates_by_models:
        samples, features_amount = template.input_shape

        if with_gaussian:
            features, target = gauss_quantiles(samples_amount=samples,
                                               features_amount=features_amount,
                                               classes_amount=classes)
        else:
            options = {
                'informative': features_amount,
                'redundant': 0,
                'repeated': 0,
                'clusters_per_class': 1
            }
            features, target = synthetic_dataset(samples_amount=samples,
                                                 features_amount=features_amount,
                                                 classes_amount=classes,
                                                 features_options=options)
        target = np.expand_dims(target, axis=1)
        data_train = InputData(idx=np.arange(0, samples),
                               features=features, target=target,
                               data_type=DataTypesEnum.table,
                               task=Task(TaskTypesEnum.classification))

        preproc_data = copy(data_train)
        preprocessor = Normalization().fit(preproc_data.features)
        preproc_data.features = preprocessor.apply(preproc_data.features)
        print(f'Fit {instance}')
        fitted_model, predictions = instance.fit(data=preproc_data)

        template.fitted_model = fitted_model
        template.data_fit = preproc_data
        template.preprocessor = preprocessor
def from_csv(file_path, delimiter=',',
             task: Task = Task(TaskTypesEnum.classification),
             data_type: DataTypesEnum = DataTypesEnum.table,
             with_target=True):
    data_frame = pd.read_csv(file_path, sep=delimiter)
    data_frame = _convert_dtypes(data_frame=data_frame)
    data_array = np.array(data_frame).T
    idx = data_array[0]
    if with_target:
        features = data_array[1:-1].T
        # np.float is deprecated (removed in NumPy 1.24); use the builtin float
        target = data_array[-1].astype(float)
    else:
        features = data_array[1:].T
        target = None
    return InputData(idx=idx, features=features, target=target,
                     task=task, data_type=data_type)
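# Usage of from_csv for unlabelled data (elsewhere in these snippets this
# function is exposed as InputData.from_csv); with_target=False treats every
# non-id column as a feature and leaves the target empty.
def example_read_unlabelled_csv(path: str = 'unlabelled.csv'):  # hypothetical path
    data = from_csv(path, with_target=False)
    assert data.target is None
    return data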
def synthetic_benchmark_dataset(samples_amount: int, features_amount: int,
                                classes_amount: int = 2,
                                features_options: Dict = DEFAULT_OPTIONS,
                                fitted_chain: Chain = None) -> InputData:
    """
    Generates a binary classification benchmark dataset that was obtained using
    the (TODO: add. reference) proposed fitting schema.

    :param samples_amount: Total amount of samples in the resulting dataset.
    :param features_amount: Total amount of features per sample.
    :param classes_amount: The amount of classes in the dataset.
    :param features_options: Feature options in a key-value form suitable for
    classification_dataset.
    :param fitted_chain: Chain with separately fitted models. If None, a
    three-level balanced tree is fitted and used as the default.
    :return: Benchmark dataset that is ready to be used by Chain.
    """
    if fitted_chain is None:
        fitted_chain = _default_chain(samples_amount=samples_amount,
                                      features_amount=features_amount,
                                      classes_amount=classes_amount)

    if classes_amount != 2:
        raise NotImplementedError('Only binary classification tasks are supported')

    features, target = classification_dataset(samples_amount=samples_amount,
                                              features_amount=features_amount,
                                              classes_amount=classes_amount,
                                              features_options=features_options)
    target = np.expand_dims(target, axis=1)

    task = Task(TaskTypesEnum.classification)
    samples_idxs = np.arange(0, samples_amount)

    train = InputData(idx=samples_idxs,
                      features=features, target=target,
                      task=task, data_type=DataTypesEnum.table)

    synth_target = fitted_chain.predict(input_data=train).predict
    synth_labels = _to_labels(synth_target)
    data_synth_train = InputData(idx=samples_idxs,
                                 features=features, target=synth_labels,
                                 task=task, data_type=DataTypesEnum.table)

    # TODO: fix preproc issues
    fitted_chain.fit_from_scratch(input_data=data_synth_train)

    features, target = classification_dataset(samples_amount=samples_amount,
                                              features_amount=features_amount,
                                              classes_amount=classes_amount,
                                              features_options=features_options)
    target = np.expand_dims(target, axis=1)
    test = InputData(idx=samples_idxs,
                     features=features, target=target,
                     data_type=DataTypesEnum.table, task=task)
    synth_target = fitted_chain.predict(input_data=test).predict
    synth_labels = _to_labels(synth_target)
    data_synth_final = InputData(idx=samples_idxs,
                                 features=features, target=synth_labels,
                                 data_type=DataTypesEnum.table, task=task)

    return data_synth_final
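# A hedged call sketch for synthetic_benchmark_dataset; the features_options
# keys below mirror those used in fit_template above and are assumed to match
# the structure of DEFAULT_OPTIONS.
def example_benchmark_dataset():
    options = {'informative': 10, 'redundant': 0,
               'repeated': 0, 'clusters_per_class': 1}
    return synthetic_benchmark_dataset(samples_amount=1000,
                                       features_amount=10,
                                       features_options=options)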
def regression_dataset():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/advanced_regression.csv'
    data = InputData.from_csv(os.path.join(test_file_path, file))
    data.task = Task(TaskTypesEnum.regression)
    return data
def run_oil_forecasting_problem(train_file_path, train_file_path_crm,
                                forecast_length, max_window_size,
                                is_visualise=False, well_id='Unknown'):
    # specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size,
                                             return_all_steps=False,
                                             make_future_prediction=False))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts,
                                          delimiter=',')

    # validation is performed on the same series: the validation subsets
    # below are shifted forward by `depth` relative to the training subsets
    full_path_test = os.path.join(str(project_root()), train_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts,
                                             delimiter=',')

    full_path_train_crm = os.path.join(str(project_root()),
                                       train_file_path_crm)
    dataset_to_train_crm = InputData.from_csv(full_path_train_crm,
                                              task=task_to_solve,
                                              data_type=DataTypesEnum.ts,
                                              delimiter=',')

    dataset_to_validate_crm = copy(dataset_to_train_crm)

    prediction_full = None
    prediction_full_crm = None
    prediction_full_crm_opt = None

    forecast_window_shift_num = 4
    depth = 100

    for forecasting_step in range(forecast_window_shift_num):
        start = 0 + depth * forecasting_step
        end = depth * 2 + depth * (forecasting_step + 1)

        dataset_to_train_local = dataset_to_train.subset(start, end)
        dataset_to_train_local_crm = dataset_to_train_crm.subset(start, end)

        dataset_to_validate_local = dataset_to_validate.subset(start + depth,
                                                               end + depth)
        dataset_to_validate_local_crm = dataset_to_validate_crm.subset(
            start + depth, end + depth)

        chain_simple = Chain(PrimaryNode('lstm'))
        chain_simple_crm = Chain(PrimaryNode('lstm'))
        chain_crm_opt = get_comp_chain()

        chain_simple.fit_from_scratch(input_data=dataset_to_train_local,
                                      verbose=False)
        chain_simple_crm.fit_from_scratch(input_data=dataset_to_train_local_crm,
                                          verbose=False)
        chain_crm_opt.fit_from_scratch(input_data=dataset_to_train_local_crm,
                                       verbose=False)

        prediction = chain_simple.predict(dataset_to_validate_local)
        prediction_crm = chain_simple_crm.predict(dataset_to_validate_local_crm)
        prediction_crm_opt = chain_crm_opt.predict(dataset_to_validate_local_crm)

        prediction_full = merge_datasets(prediction_full, prediction,
                                         forecasting_step)
        prediction_full_crm = merge_datasets(prediction_full_crm,
                                             prediction_crm, forecasting_step)
        prediction_full_crm_opt = merge_datasets(prediction_full_crm_opt,
                                                 prediction_crm_opt,
                                                 forecasting_step)

    rmse_on_valid_simple = calculate_validation_metric(
        prediction_full, prediction_full_crm, prediction_full_crm_opt,
        dataset_to_validate, well_id, is_visualise)

    print(well_id)
    print(f'RMSE CRM: {round(rmse_on_valid_simple[0])}')
    print(f'RMSE ML: {round(rmse_on_valid_simple[1])}')
    print(f'RMSE ML with CRM: {round(rmse_on_valid_simple[2])}')
    print(f'Evo RMSE ML with CRM: {round(rmse_on_valid_simple[3])}')
    print(f'DTW CRM: {round(rmse_on_valid_simple[4])}')
    print(f'DTW ML: {round(rmse_on_valid_simple[5])}')
    print(f'DTW ML with CRM: {round(rmse_on_valid_simple[6])}')
    print(f'DTW RMSE ML with CRM: {round(rmse_on_valid_simple[7])}')

    return rmse_on_valid_simple
def run_fedot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task_type = params.task

    if task_type == TaskTypesEnum.classification:
        metric = ClassificationMetricsEnum.ROCAUC
    elif task_type == TaskTypesEnum.regression:
        metric = RegressionMetricsEnum.RMSE
    else:
        raise NotImplementedError()

    task = Task(task_type)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    models_hyperparameters = get_models_hyperparameters()['FEDOT']
    cur_lead_time = models_hyperparameters['MAX_RUNTIME_MINS']

    saved_model_name = f'fedot_{case_label}_{task_type}_{cur_lead_time}_{metric}'
    loaded_model = load_fedot_model(saved_model_name)

    if not loaded_model:
        generations = models_hyperparameters['GENERATIONS']
        population_size = models_hyperparameters['POPULATION_SIZE']

        # the search of the models provided by the framework
        # that can be used as nodes in a chain
        models_repo = ModelTypesRepository()
        available_model_types, _ = models_repo.suitable_model(task.task_type)

        metric_function = MetricsRepository().metric_by_id(metric)

        composer_requirements = GPComposerRequirements(
            primary=available_model_types, secondary=available_model_types,
            max_arity=3, max_depth=3, pop_size=population_size,
            num_of_generations=generations,
            crossover_prob=0.8, mutation_prob=0.8,
            max_lead_time=datetime.timedelta(minutes=cur_lead_time))

        # Create GP-based composer
        composer = GPComposer()

        # the optimal chain generation by composition - the most time-consuming task
        chain_evo_composed = composer.compose_chain(
            data=dataset_to_compose,
            initial_chain=None,
            composer_requirements=composer_requirements,
            metrics=metric_function, is_visualise=False)
        chain_evo_composed.fine_tune_primary_nodes(input_data=dataset_to_compose,
                                                   iterations=50)
        chain_evo_composed.fit(input_data=dataset_to_compose, verbose=False)
        save_fedot_model(chain_evo_composed, saved_model_name)
    else:
        chain_evo_composed = loaded_model

    evo_predicted = chain_evo_composed.predict(dataset_to_validate)

    return dataset_to_validate.target, evo_predicted.predict
def classification_dataset():
    test_file_path = str(os.path.dirname(__file__))
    file = os.path.join('data', 'advanced_classification.csv')
    return InputData.from_csv(os.path.join(test_file_path, file),
                              task=Task(TaskTypesEnum.classification))
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=64,
                                     with_visualisation=True):
    # specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)

    ref_chain = get_composite_lstm_chain()

    available_model_types_primary = ['trend_data_model',
                                     'residual_data_model']

    available_model_types_secondary = ['rfr', 'linear',
                                       'ridge', 'lasso',
                                       'additive_data_model']

    composer = FixedStructureComposer()

    composer_requirements = GPComposerRequirements(
        primary=available_model_types_primary,
        secondary=available_model_types_secondary,
        max_arity=2, max_depth=4, pop_size=10, num_of_generations=10,
        crossover_prob=0, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=20))

    chain = composer.compose_chain(data=dataset_to_train,
                                   initial_chain=ref_chain,
                                   composer_requirements=composer_requirements,
                                   metrics=metric_function,
                                   is_visualise=False)

    if with_visualisation:
        ComposerVisualiser.visualise(chain)

    chain.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid = calculate_validation_metric(
        chain.predict(dataset_to_validate), dataset_to_validate,
        f'full-composite_{forecast_length}',
        is_visualise=with_visualisation)

    print(f'RMSE composite: {rmse_on_valid}')

    return rmse_on_valid
def synthetic_forecasting_problem(forecast_length: int, max_window_size: int):
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length,
                                    max_window_size=max_window_size))

    ts_len = 10
    ts = np.asarray(range(ts_len))

    exog_variable = 10 + np.asarray(range(ts_len)).reshape(-1, 1)

    ts_data = InputData(idx=range(len(ts)), features=exog_variable,
                        target=ts, task=task,
                        data_type=DataTypesEnum.table)

    # shape is (5, 4, 1)
    exog_variable_3d = np.asarray([[[10], [11], [12], [13]],
                                   [[11], [12], [13], [14]],
                                   [[12], [13], [14], [15]],
                                   [[13], [14], [15], [16]],
                                   [[14], [15], [16], [17]]])
    lagged_target_3d_as_feature = np.asarray([[[0], [1], [2], [3]],
                                              [[1], [2], [3], [4]],
                                              [[2], [3], [4], [5]],
                                              [[3], [4], [5], [6]],
                                              [[4], [5], [6], [7]]])
    # concatenate the lagged exogenous variables with the lagged target,
    # so the features are now (5, 4, 2), i.e.
    # (n - max_window_size - forecast_length + 1, max_window_size,
    #  amount_exog_features + target_shape)
    feature_3d = np.concatenate((exog_variable_3d,
                                 lagged_target_3d_as_feature), axis=2)

    # the target is (5, 4, 1), i.e.
    # (n - max_window_size - forecast_length + 1, max_window_size, target_shape),
    # so the LSTM returns predictions of the same shape.
    # To keep only the forecast values, take the last `forecast_length`
    # timestamps: pred_3d[:, -forecast_length:, :]
    target_3d = np.asarray([[[2], [3], [4], [5]],
                            [[3], [4], [5], [6]],
                            [[4], [5], [6], [7]],
                            [[5], [6], [7], [8]],
                            [[6], [7], [8], [9]]])

    ts_data_3d = InputData(idx=range(len(ts)), features=feature_3d,
                           target=target_3d, task=task,
                           data_type=DataTypesEnum.ts_lagged_3d)

    # the lagged format keeps only the values to forecast (future values)
    # in the target; this format is convenient for classic regression modules.
    # shape is (5, 2), i.e.
    # (n - max_window_size - forecast_length + 1, forecast_length * target_shape)
    target_lagged = np.asarray([[4, 5],
                                [5, 6],
                                [6, 7],
                                [7, 8],
                                [8, 9]])
    # in the lagged format the features match feature_3d, but flattened to 2d
    # with shape (n - max_window_size - forecast_length + 1,
    #             max_window_size * (amount_exog_features + target_shape))
    features_lagged = feature_3d.reshape(feature_3d.shape[0], -1)

    ts_data_lagged = InputData(idx=range(len(ts)), features=features_lagged,
                               target=target_lagged, task=task,
                               data_type=DataTypesEnum.ts_lagged_table)

    return task, ts_len, ts_data, ts_data_3d, ts_data_lagged
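# A sanity check, derived only from the shapes constructed above, relating
# the 3d lagged features to their flattened 2d form:
def example_lagged_shapes():
    task, ts_len, ts_data, ts_data_3d, ts_data_lagged = \
        synthetic_forecasting_problem(forecast_length=2, max_window_size=4)
    n, window, channels = ts_data_3d.features.shape  # (5, 4, 2)
    assert ts_data_lagged.features.shape == (n, window * channels)  # (5, 8)
    # each 2d row is the row-major flattening of the matching 3d slice
    assert np.array_equal(ts_data_lagged.features[0],
                          ts_data_3d.features[0].reshape(-1))
    return ts_data_lagged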