Example #1
def run_refinement_scoring_example(train_path, test_path, with_tuning=False):
    """ Function launch example with error modeling for classification task

    :param train_path: path to the csv file with training sample
    :param test_path: path to the csv file with test sample
    :param with_tuning: is it need to tune pipelines or not
    """

    task = Task(TaskTypesEnum.classification)
    train_dataset = InputData.from_csv(train_path, task=task)
    test_dataset = InputData.from_csv(test_path, task=task)

    # Get and fit pipelines
    no_decompose_c = get_non_refinement_pipeline()
    decompose_c = get_refinement_pipeline()

    no_decompose_c.fit(train_dataset)
    decompose_c.fit(train_dataset)

    # Check metrics for both pipelines
    display_roc_auc(no_decompose_c, test_dataset, 'Non decomposition pipeline')
    display_roc_auc(decompose_c, test_dataset, 'With decomposition pipeline')

    if with_tuning:
        no_decompose_c.fine_tune_all_nodes(loss_function=roc_auc, loss_params=None,
                                           input_data=train_dataset, iterations=30)

        decompose_c.fine_tune_all_nodes(loss_function=roc_auc, loss_params=None,
                                        input_data=train_dataset, iterations=30)

        display_roc_auc(no_decompose_c, test_dataset, 'Non decomposition pipeline after tuning')
        display_roc_auc(decompose_c, test_dataset, 'With decomposition pipeline after tuning')
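
A minimal launch sketch for the example above; the CSV paths are placeholders and should point to local binary-classification train/test samples.

if __name__ == '__main__':
    # Placeholder paths; replace with real train/test CSV files
    run_refinement_scoring_example('scoring_train.csv',
                                   'scoring_test.csv',
                                   with_tuning=True)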
Example #2
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    training_features = train_data.features
    testing_features = test_data.features
    training_target = train_data.target
    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()), RandomForestClassifier())
    # Fix the random state for all steps in the exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict_proba(testing_features)[:, 1]

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results)

    print(f'ROC AUC for TPOT: {roc_auc_value}')

    node_scaling = PrimaryNode('scaling')
    node_bernb = SecondaryNode('bernb', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_bernb, node_scaling])
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(f'ROC AUC for FEDOT: {roc_auc_value}')

    return roc_auc_value
Example #3
def test_target_data_from_csv_correct():
    """ Function tests two ways of processing target columns in "from_csv"
    method
    """
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/multi_target_sample.csv'
    path = os.path.join(test_file_path, file)
    task = Task(TaskTypesEnum.regression)

    # Process one column
    target_column = '1_day'
    one_column_data = InputData.from_csv(path,
                                         target_columns=target_column,
                                         columns_to_drop=['date'],
                                         task=task)

    # Process multiple target columns
    target_columns = [
        '1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day'
    ]
    seven_columns_data = InputData.from_csv(path,
                                            target_columns=target_columns,
                                            columns_to_drop=['date'],
                                            task=task)

    assert one_column_data.target.shape == (499, 1)
    assert seven_columns_data.target.shape == (499, 7)
Example #4
def test_with_custom_target():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/simple_classification.csv'
    file_custom = '../../data/simple_classification_with_custom_target.csv'

    file_data = InputData.from_csv(os.path.join(test_file_path, file))

    expected_features = file_data.features
    expected_target = file_data.target

    custom_file_data = InputData.from_csv(os.path.join(test_file_path,
                                                       file_custom),
                                          delimiter=';')
    actual_features = custom_file_data.features
    actual_target = custom_file_data.target

    assert not np.array_equal(expected_features, actual_features)
    assert not np.array_equal(expected_target, actual_target)

    custom_file_data = InputData.from_csv(os.path.join(test_file_path,
                                                       file_custom),
                                          delimiter=';',
                                          columns_to_drop=['redundant'],
                                          target_columns='custom_target')

    actual_features = custom_file_data.features
    actual_target = custom_file_data.target

    assert np.array_equal(expected_features, actual_features)
    assert np.array_equal(expected_target, actual_target)
Example #5
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    node_tpot.model.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target,
                            y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example #6
def get_case_train_test_data():
    """ Function for getting data for train and validation """
    train_file_path, test_file_path = get_scoring_case_data_paths()

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)
    return train_data, test_data
Example #7
def run_autokeras(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    config_data = get_models_hyperparameters()['autokeras']
    max_trial = config_data['MAX_TRIAL']
    epoch = config_data['EPOCH']

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    # TODO Save model to file

    if task == TaskTypesEnum.classification:
        estimator = ak.StructuredDataClassifier
    else:
        estimator = ak.StructuredDataRegressor

    model = estimator(max_trials=max_trial)

    model.fit(train_data.features, train_data.target, epochs=epoch)

    predicted = model.predict(test_data.features)

    return test_data.target, predicted
Example #8
def run_h2o(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_label}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data, round(max_runtime_secs / 60))
        temp_exported_model_path = h2o.save_model(model=best_model,
                                                  path=CURRENT_PATH)

        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predicted = predict_h2o(imported_model, test_frame)

    h2o.shutdown(prompt=False)

    return true_target, predicted
Example #9
def run_pipeline_from_automl(train_file_path: str,
                             test_file_path: str,
                             max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run pipeline with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    node_tpot.operation.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    OperationTypesRepository.assign_repo('model', 'automl_repository.json')
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example #10
def run_xgboost(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    if task == TaskTypesEnum.classification:
        model = xgb.XGBClassifier(max_depth=2,
                                  learning_rate=1.0,
                                  objective='binary:logistic')
        model.fit(train_data.features, train_data.target)
        predicted = model.predict_proba(test_data.features)[:, 1]
        predicted_labels = model.predict(test_data.features)

    elif task == TaskTypesEnum.regression:
        xgbr = xgb.XGBRegressor(max_depth=3,
                                learning_rate=0.3,
                                n_estimators=300,
                                objective='reg:squarederror')
        xgbr.fit(train_data.features, train_data.target)
        predicted = xgbr.predict(test_data.features)
        predicted_labels = None

    else:
        raise NotImplementedError()
    return test_data.target, predicted, predicted_labels
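
ExecutionParams is only referenced by a string type hint here; a minimal stand-in sketch, assuming it simply bundles the fields the runners read (train_file, test_file, case_label, task):

from collections import namedtuple

# Hypothetical stand-in for ExecutionParams; only the fields used above
ExecutionParams = namedtuple('ExecutionParams',
                             ['train_file', 'test_file', 'case_label', 'task'])

params = ExecutionParams(train_file='train.csv', test_file='test.csv',
                         case_label='demo', task=TaskTypesEnum.classification)
true_target, predicted, predicted_labels = run_xgboost(params)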
Example #11
def run_chain_from_automl(train_file_path: str,
                          test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run chain with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for chain
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    node_tpot.operation.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example #12
def get_scoring_data() -> Tuple[InputData, InputData]:
    train_data_path = f'{fedot_project_root()}/cases/data/scoring/scoring_train.csv'
    test_data_path = f'{fedot_project_root()}/cases/data/scoring/scoring_test.csv'

    train_data = InputData.from_csv(train_data_path)
    test_data = InputData.from_csv(test_data_path)

    return train_data, test_data
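
A sketch of feeding the returned pair into the high-level Fedot API, mirroring its use in the multi-output example further down; settings are left at their defaults.

train, test = get_scoring_data()

automl_model = Fedot(problem='classification')
automl_model.fit(features=train)
predicted_labels = automl_model.predict(features=test)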
Example #13
def get_scoring_data():
    file_path_train = 'cases/data/scoring/scoring_train.csv'
    full_path_train = join(str(project_root()), file_path_train)

    # a dataset for a final validation of the composed model
    file_path_test = 'cases/data/scoring/scoring_test.csv'
    full_path_test = join(str(project_root()), file_path_test)
    task = Task(TaskTypesEnum.classification)
    train = InputData.from_csv(full_path_train, task=task)
    test = InputData.from_csv(full_path_test, task=task)

    return train, test
Example #14
def file_data_setup():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/simple_classification.csv'
    input_data = InputData.from_csv(
        os.path.join(test_file_path, file))
    input_data.idx = _to_numerical(categorical_ids=input_data.idx)
    return input_data
Example #15
def apply_model_to_data(model: Chain, data_path: str):
    df, file_path = create_multi_clf_examples_from_excel(data_path,
                                                         return_df=True)
    dataset_to_apply = InputData.from_csv(file_path, target_column=None)
    evo_predicted = model.predict(dataset_to_apply)
    df['forecast'] = probs_to_labels(evo_predicted.predict)
    return df
Example #16
def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # search for the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(
        task_type=task.task_type, tags=['simple'])

    metric_function = MetricsRepository(). \
        metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer that allows finding
    # the optimal structure of the composite model
    builder = GPComposerBuilder(task).with_requirements(
        composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    # run the search for the best suitable model
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
Example #17
def run_multi_output_case(path, vis=False):
    """ Function launch case for river levels prediction on Lena river as
    multi-output regression task

    :param path: path to the file with table
    :param vis: is it needed to visualise pipeline and predictions
    """
    target_columns = [
        '1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day'
    ]

    data = InputData.from_csv(path,
                              target_columns=target_columns,
                              columns_to_drop=['date'])
    train, test = train_test_data_setup(data)

    problem = 'regression'

    automl_model = Fedot(problem=problem)
    automl_model.fit(features=train)
    predicted_array = automl_model.predict(features=test)

    # Convert the output into a one-dimensional array
    forecast = np.ravel(predicted_array)

    mae_value = mean_absolute_error(np.ravel(test.target), forecast)
    print(f'MAE - {mae_value:.2f}')

    if vis:
        plot_predictions(predicted_array, test)
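
A launch sketch; the path is illustrative and should point to a table with a 'date' column and the '1_day'..'7_day' targets listed above.

if __name__ == '__main__':
    # Illustrative path to a multi-output river-levels table
    run_multi_output_case('data/lena_levels/multi_sample.csv', vis=True)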
Example #18
def get_kc2_data():
    file_path = 'cases/data/kc2/kc2.csv'
    full_path = join(str(project_root()), file_path)
    task = Task(TaskTypesEnum.classification)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)

    return train, test
Example #19
def create_fitted_chain() -> Chain:
    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)

    chain = create_chain()
    chain.fit(train_data)

    return chain
Example #20
def get_cholesterol_data():
    file_path = 'cases/data/cholesterol/cholesterol.csv'
    full_path = join(str(project_root()), file_path)
    task = Task(TaskTypesEnum.regression)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)

    return train, test
Example #21
def run_tpot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    models_hyperparameters = get_models_hyperparameters()['TPOT']
    generations = models_hyperparameters['GENERATIONS']
    population_size = models_hyperparameters['POPULATION_SIZE']

    result_model_filename = f'{case_label}_g{generations}' \
                            f'_p{population_size}_{task.name}.pkl'
    current_file_path = str(os.path.dirname(__file__))
    result_file_path = os.path.join(current_file_path, result_model_filename)

    train_data = InputData.from_csv(train_file_path, task=Task(task))

    if result_model_filename not in os.listdir(current_file_path):
        # TODO change hyperparameters to actual from variable
        model = fit_tpot(train_data,
                         models_hyperparameters['MAX_RUNTIME_MINS'])

        model.export(
            output_file_name=f'{result_model_filename[:-4]}_pipeline.py')

        # sklearn pipeline object
        fitted_model_config = model.fitted_pipeline_
        joblib.dump(fitted_model_config, result_file_path, compress=1)

    imported_model = joblib.load(result_file_path)

    predict_data = InputData.from_csv(test_file_path, task=Task(task))
    true_target = predict_data.target
    if task == TaskTypesEnum.regression:
        predicted = predict_tpot_reg(imported_model, predict_data)
        predicted_labels = predicted
    elif task == TaskTypesEnum.classification:
        predicted, predicted_labels = predict_tpot_class(
            imported_model, predict_data)
    else:
        print('Unsupported ML task type')
        raise NotImplementedError()

    print(f'BEST_model: {imported_model}')

    return true_target, predicted, predicted_labels
Example #22
def create_fitted_pipeline() -> Pipeline:
    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)

    pipeline = create_pipeline()
    pipeline.fit(train_data)

    return pipeline
Example #23
def test_string_features_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/classification_with_categorical.csv'
    expected_features = InputData.from_csv(os.path.join(test_file_path,
                                                        file)).features

    assert expected_features.dtype == float
    assert np.isfinite(expected_features).all()
Example #24
def test_fitted_pipeline_cache_correctness_after_export_and_import():
    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    pipeline = create_classification_pipeline_with_preprocessing()
    pipeline.fit(train_data)
    pipeline.save('test_fitted_pipeline_cache_correctness_after_export_and_import')
    prediction = pipeline.predict(test_data)

    new_pipeline = Pipeline()
    new_pipeline.load(create_correct_path('test_fitted_pipeline_cache_correctness_after_export_and_import'))

    new_prediction = new_pipeline.predict(test_data)

    assert np.array_equal(prediction.predict, new_prediction.predict)
    assert new_pipeline.is_fitted
Example #25
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               gp_optimiser_params: Optional[GPChainOptimiserParameters] = None, pop_size=None,
                               generations=None):
    dataset_to_compose = InputData.from_csv(train_file_path)
    dataset_to_validate = InputData.from_csv(test_file_path)

    available_model_types, _ = ModelTypesRepository(). \
        suitable_model(task_type=TaskTypesEnum.classification)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)

    if gp_optimiser_params:
        optimiser_parameters = gp_optimiser_params
    else:
        selection_types = [SelectionTypesEnum.tournament]
        crossover_types = [CrossoverTypesEnum.subtree]
        mutation_types = [MutationTypesEnum.simple, MutationTypesEnum.growth, MutationTypesEnum.reduce]
        regularization_type = RegularizationTypesEnum.decremental
        optimiser_parameters = GPChainOptimiserParameters(selection_types=selection_types,
                                                          crossover_types=crossover_types,
                                                          mutation_types=mutation_types,
                                                          regularization_type=regularization_type)
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types, max_arity=4,
        max_depth=3, pop_size=pop_size, num_of_generations=generations,
        crossover_prob=0.8, mutation_prob=0.8, max_lead_time=max_lead_time)

    # Create GP-based composer
    composer = GPComposer()

    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                initial_chain=None,
                                                composer_requirements=composer_requirements,
                                                metrics=metric_function, optimiser_parameters=optimiser_parameters,
                                                is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    roc_on_valid_evo_composed = calculate_validation_metric(chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed, chain_evo_composed, composer
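
A launch sketch with default optimiser parameters (gp_optimiser_params may stay None); the paths and search budget below are placeholders.

if __name__ == '__main__':
    # Placeholder paths and a deliberately small search budget
    roc_auc_metric, chain, composer = run_credit_scoring_problem(
        'scoring_train.csv', 'scoring_test.csv',
        max_lead_time=datetime.timedelta(minutes=2),
        pop_size=20, generations=5)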
Example #26
def validate_model_quality(model: Pipeline, data_path: str):
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict

    roc_auc_valid = round(roc_auc(y_true=dataset_to_validate.target,
                                  y_score=predicted_labels,
                                  multi_class='ovo',
                                  average='macro'), 3)
    return roc_auc_valid
Example #27
def test_evaluate_individuals():
    project_root_path = str(fedot_project_root())
    full_path_train = os.path.join(project_root_path,
                                   'test/data/simple_classification.csv')

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types)

    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function)

    composer = builder.build()

    pipelines_to_evaluate = [
        pipeline_first(),
        pipeline_second(),
        pipeline_third(),
        pipeline_fourth()
    ]

    train_data, test_data = train_test_data_setup(
        dataset_to_compose,
        sample_split_ratio_for_tasks[dataset_to_compose.task.task_type])
    metric_function_for_nodes = partial(composer.composer_metric,
                                        composer.metrics, train_data,
                                        test_data)
    adapter = PipelineAdapter()
    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=0.001)
    params = GraphGenerationParams(adapter=PipelineAdapter(),
                                   advisor=PipelineChangeAdvisor())
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 1
    assert population[0].fitness is not None

    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=5)
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 4
    assert all([ind.fitness is not None for ind in population])
Example #28
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    training_features = train_data.features
    testing_features = test_data.features
    training_target = train_data.target
    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier()
    )
    # Fix the random state for all steps in the exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict_proba(testing_features)[:, 1]

    roc_auc_value = roc_auc(y_true=testing_target,
                            y_score=results)

    print(roc_auc_value)

    chain = Chain()
    node_first = PrimaryNode('direct_data_model')
    node_second = PrimaryNode('bernb')
    node_third = SecondaryNode('rf')

    node_third.nodes_from.append(node_first)
    node_third.nodes_from.append(node_second)

    chain.add_node(node_third)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target,
                            y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example #29
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=32,
                                     is_visualise=False):
    # specify the task to solve
    task_to_solve = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    chain_simple = TsForecastingChain(PrimaryNode('linear'))
    chain_simple.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_simple = calculate_validation_metric(
        chain_simple.predict(dataset_to_validate),
        dataset_to_validate,
        f'full-simple_{forecast_length}',
        is_visualise=is_visualise)
    print(f'RMSE simple: {rmse_on_valid_simple}')

    chain_composite_lstm = get_composite_chain()
    chain_composite_lstm.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_lstm_only = calculate_validation_metric(
        chain_composite_lstm.predict(dataset_to_validate),
        dataset_to_validate,
        f'full-lstm-only_{forecast_length}',
        is_visualise=is_visualise)
    print(f'RMSE LSTM composite: {rmse_on_valid_lstm_only}')

    return rmse_on_valid_simple
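
A launch sketch; both paths are placeholders for metocean time-series CSVs relative to the project root, and the horizon is an arbitrary illustration.

if __name__ == '__main__':
    # Placeholder relative paths; joined with project_root() inside the runner
    rmse = run_metocean_forecasting_problem(
        'cases/data/metocean/metocean_train.csv',
        'cases/data/metocean/metocean_test.csv',
        forecast_length=6, is_visualise=True)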
Example #30
def test_data_model_type_classification_chain_fit():
    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)

    chain = create_classification_chain_with_preprocessing()
    chain.fit(train_data)
    chain.save('data_model_classification')

    expected_len_nodes = len(chain.nodes)
    actual_len_nodes = len(ChainTemplate(chain).operation_templates)

    assert actual_len_nodes == expected_len_nodes