Example #1
def get_split_data_paths():
    file_path_train = 'test/data/simple_regression_train.csv'
    file_path_test = 'test/data/simple_regression_test.csv'
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)
    full_path_test = os.path.join(str(fedot_project_root()), file_path_test)

    return full_path_train, full_path_test
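
# Usage sketch (illustrative, not part of the original test module): load the
# split csv files into InputData objects for a regression task, assuming the
# imports used by the other examples here (InputData, Task, TaskTypesEnum).
full_path_train, full_path_test = get_split_data_paths()
task = Task(TaskTypesEnum.regression)
train_data = InputData.from_csv(full_path_train, task=task)
test_data = InputData.from_csv(full_path_test, task=task)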
Example #2
def prepare_multi_modal_data(files_path, task: Task, images_size=(128, 128), with_split=True):
    path = os.path.join(str(fedot_project_root()), files_path)

    unpack_archived_data(path)

    data = InputData.from_json_files(path, fields_to_use=['votes', 'year'],
                                     label='rating', task=task)

    # Binarise the 'rating' target: class 1 for ratings greater than 7
    class_labels = np.asarray([0 if t <= 7 else 1 for t in data.target])
    data.target = class_labels

    ratio = 0.5

    img_files_path = f'{files_path}/*.jpeg'
    img_path = os.path.join(str(fedot_project_root()), img_files_path)

    data_img = InputData.from_image(images=img_path, labels=class_labels, task=task, target_size=images_size)

    data_text = InputData.from_json_files(path, fields_to_use=['plot'],
                                          label='rating', task=task,
                                          data_type=DataTypesEnum.text)
    data_text.target = class_labels

    if with_split:
        train_num, test_num = train_test_data_setup(data, shuffle_flag=False, split_ratio=ratio)
        train_img, test_img = train_test_data_setup(data_img, shuffle_flag=False, split_ratio=ratio)
        train_text, test_text = train_test_data_setup(data_text, shuffle_flag=False, split_ratio=ratio)
    else:
        train_num, test_num = data, data
        train_img, test_img = data_img, data_img
        train_text, test_text = data_text, data_text

    return train_num, test_num, train_img, test_img, train_text, test_text
Example #3
def get_scoring_case_data_paths() -> Tuple[str, str]:
    train_file_path = os.path.join('cases', 'data', 'scoring', 'scoring_train.csv')
    test_file_path = os.path.join('cases', 'data', 'scoring', 'scoring_test.csv')
    full_train_file_path = os.path.join(str(fedot_project_root()), train_file_path)
    full_test_file_path = os.path.join(str(fedot_project_root()), test_file_path)

    return full_train_file_path, full_test_file_path
Example #4
def test_multivariate_ts():
    forecast_length = 1

    file_path_train = 'cases/data/metocean/metocean_data_train.csv'
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)

    # a dataset for final validation of the composed model
    file_path_test = 'cases/data/metocean/metocean_data_test.csv'
    full_path_test = os.path.join(str(fedot_project_root()), file_path_test)

    target_history, add_history, obs = prepare_input_data(
        full_path_train, full_path_test)

    historical_data = {
        'ws': add_history,  # additional variable
        'ssh': target_history,  # target variable
    }

    fedot = Fedot(
        problem='ts_forecasting',
        composer_params=composer_params,
        task_params=TsForecastingParams(forecast_length=forecast_length))
    fedot.fit(features=historical_data, target=target_history)
    forecast = fedot.forecast(historical_data, forecast_length=forecast_length)
    assert forecast is not None
Example #5
def test_evaluate_individuals():
    project_root_path = str(fedot_project_root())
    full_path_train = os.path.join(project_root_path,
                                   'test/data/simple_classification.csv')

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types)

    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function)

    composer = builder.build()

    pipelines_to_evaluate = [
        pipeline_first(),
        pipeline_second(),
        pipeline_third(),
        pipeline_fourth()
    ]

    train_data, test_data = train_test_data_setup(
        dataset_to_compose,
        sample_split_ratio_for_tasks[dataset_to_compose.task.task_type])
    metric_function_for_nodes = partial(composer.composer_metric,
                                        composer.metrics, train_data,
                                        test_data)
    adapter = PipelineAdapter()
    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=0.001)
    params = GraphGenerationParams(adapter=PipelineAdapter(),
                                   advisor=PipelineChangeAdvisor())
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 1
    assert population[0].fitness is not None

    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=5)
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 4
    assert all([ind.fitness is not None for ind in population])
Example #6
def test_credit_scoring_problem():
    project_root_path = str(fedot_project_root())
    full_path_train = os.path.join(project_root_path,
                                   'test/data/simple_classification.csv')
    full_path_test = full_path_train

    roc_auc_test = run_credit_scoring_problem(full_path_train,
                                              full_path_test,
                                              timeout=timedelta(minutes=0.1))
    assert roc_auc_test > 0.5
Example #7
def get_scoring_data():
    file_path_train = join('cases', 'data', 'scoring', 'scoring_train.csv')
    full_path_train = join(str(fedot_project_root()), file_path_train)

    # a dataset for final validation of the composed model
    file_path_test = join('cases', 'data', 'scoring', 'scoring_test.csv')
    full_path_test = join(str(fedot_project_root()), file_path_test)
    task = Task(TaskTypesEnum.classification)
    train = InputData.from_csv(full_path_train, task=task)
    test = InputData.from_csv(full_path_test, task=task)

    return train, test
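
# Usage sketch (illustrative): fit a minimal single-node pipeline on the
# scoring data; Pipeline and PrimaryNode are assumed to be imported as in
# the pipeline examples below.
train, test = get_scoring_data()
pipeline = Pipeline(PrimaryNode('logit'))
pipeline.fit(train)
prediction = pipeline.predict(test)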
Example #8
def get_scoring_data():
    # the dataset was obtained from https://www.kaggle.com/c/GiveMeSomeCredit

    # a dataset that will be used as the train and test sets during composition

    file_path_train = 'cases/data/scoring/scoring_train.csv'
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)

    # a dataset for final validation of the composed model
    file_path_test = 'cases/data/scoring/scoring_test.csv'
    full_path_test = os.path.join(str(fedot_project_root()), file_path_test)

    return full_path_train, full_path_test
Example #9
def test_metocean_forecasting_problem():
    project_root_path = str(fedot_project_root())
    full_path_train = os.path.join(project_root_path,
                                   'test/data/simple_time_series.csv')
    full_path_test = full_path_train

    rmse = run_metocean_forecasting_problem(full_path_train,
                                            full_path_test,
                                            forecast_length=2,
                                            timeout=0.1)
    print(rmse)
    assert rmse['rmse'] < 500
Example #10
def run_gapfilling_case(file_path):
    """
    The function runs an example of filling in gaps in a time series with
    air temperature. Real data case.

    :param file_path: path to the file
    :return: pandas dataframe with columns 'date','with_gap','ridge',
    'composite','temperature'
    """

    # Load dataframe
    full_path = os.path.join(str(fedot_project_root()), file_path)
    dataframe = pd.read_csv(full_path)
    dataframe['date'] = pd.to_datetime(dataframe['date'])

    # Filling in gaps based on inverted ridge regression model
    ridge_pipeline = get_simple_pipeline()
    ridge_gapfiller = ModelGapFiller(gap_value=-100.0,
                                     pipeline=ridge_pipeline)
    with_gap_array = np.array(dataframe['with_gap'])
    without_gap_arr_ridge = ridge_gapfiller.forward_inverse_filling(with_gap_array)
    dataframe['ridge'] = without_gap_arr_ridge

    # Filling in gaps based on a pipeline of 5 models
    composite_pipeline = get_composite_pipeline()
    composite_gapfiller = ModelGapFiller(gap_value=-100.0,
                                         pipeline=composite_pipeline)
    without_gap_composite = composite_gapfiller.forward_filling(with_gap_array)
    dataframe['composite'] = without_gap_composite
    return dataframe
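
# Usage sketch: the csv path below is hypothetical and only shows the expected
# call shape; file_path is resolved relative to fedot_project_root() inside the
# function, and the file must contain 'date' and 'with_gap' columns.
gapfilled_df = run_gapfilling_case('cases/data/gapfilling/ts_temperature_gapfilling.csv')
print(gapfilled_df[['with_gap', 'ridge', 'composite']].head())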
Example #11
def test_tpot_vs_fedot_example():
    project_root_path = str(fedot_project_root())
    file_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')
    file_path_test = file_path_train

    auc = run_tpot_vs_fedot_example(file_path_train, file_path_test)
    assert auc > 0.5
Example #12
def test_lagged_with_invalid_params_fit_correctly():
    """ The function define a pipeline with incorrect parameters in the lagged
    transformation. During the training of the pipeline, the parameter 'window_size'
    is corrected
    """
    window_size = 600
    len_forecast = 50

    # The length of the time series is 500 elements
    project_root_path = str(fedot_project_root())
    file_path = os.path.join(project_root_path, 'test/data/short_time_series.csv')
    df = pd.read_csv(file_path)
    time_series = np.array(df['sea_height'])

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(time_series)),
                            features=time_series,
                            target=time_series,
                            task=task,
                            data_type=DataTypesEnum.ts)

    # Get pipeline with lagged transformation in it
    pipeline = get_ts_pipeline(window_size)

    # Fit it
    pipeline.fit(train_input)

    # Reaching this point means the pipeline was fitted despite the invalid parameter
    is_pipeline_fitted = True
    assert is_pipeline_fitted
Example #13
def get_cholesterol_data():
    file_path = join('cases', 'data', 'cholesterol', 'cholesterol.csv')
    full_path = join(str(fedot_project_root()), file_path)
    task = Task(TaskTypesEnum.regression)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)

    return train, test
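
# Usage sketch (illustrative): train a minimal ridge pipeline on the
# cholesterol split, reusing the Pipeline and PrimaryNode classes from the
# other examples.
train, test = get_cholesterol_data()
pipeline = Pipeline(PrimaryNode('ridge'))
pipeline.fit(train)
prediction = pipeline.predict(test)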
Example #14
def get_kc2_data():
    file_path = join('cases', 'data', 'kc2', 'kc2.csv')
    full_path = join(str(fedot_project_root()), file_path)
    task = Task(TaskTypesEnum.classification)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)

    return train, test
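
# Usage sketch (illustrative): a two-node scaling -> logit pipeline on the
# kc2 classification split, mirroring the node wiring of the river levels
# example below.
train, test = get_kc2_data()
node_scaling = PrimaryNode('scaling')
node_logit = SecondaryNode('logit', nodes_from=[node_scaling])
pipeline = Pipeline(node_logit)
pipeline.fit(train)
prediction = pipeline.predict(test)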
Example #15
def test_spam_detection_problem():
    """ Simple launch of spam detection case """
    project_root_path = str(fedot_project_root())
    file_path_train = os.path.join(project_root_path,
                                   'test/data/spam_detection.csv')

    # Classification task based on text data
    run_text_problem_from_saved_meta_file(file_path_train)
Example #16
def test_pipeline_from_automl_example():
    project_root_path = str(fedot_project_root())
    with OperationTypesRepository().assign_repo('model', 'model_repository_with_automl.json') as _:
        file_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')
        file_path_test = file_path_train

        auc = run_pipeline_from_automl(file_path_train, file_path_test, max_run_time=timedelta(seconds=1))
    OperationTypesRepository.assign_repo('model', 'model_repository.json')

    assert auc > 0.5
Example #17
def prepare_input_data(train_file_path, test_file_path):
    """ Function for preparing InputData for train and test algorithm

    :param train_file_path: path to the csv file for training
    :param test_file_path: path to the csv file for validation

    :return dataset_to_train: InputData for train
    :return dataset_to_validate: InputData for validation
    """
    # Load train and test dataframes
    full_path_train = os.path.join(str(fedot_project_root()), train_file_path)
    full_path_test = os.path.join(str(fedot_project_root()), test_file_path)
    df_train = pd.read_csv(full_path_train)
    df_test = pd.read_csv(full_path_test)

    ws_history = np.ravel(np.array(df_train['wind_speed']))
    ssh_history = np.ravel(np.array(df_train['sea_height']))
    ssh_obs = np.ravel(np.array(df_test['sea_height']))

    return ssh_history, ws_history, ssh_obs
Example #18
def test_multi_modal_pipeline():
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    files_path = os.path.join('test', 'data', 'multi_modal')
    path = os.path.join(str(fedot_project_root()), files_path)

    train_num, _, train_img, _, train_text, _ = \
        prepare_multi_modal_data(path, task, images_size, with_split=False)

    # image
    ds_image = PrimaryNode('data_source_img')
    image_node = SecondaryNode('cnn', nodes_from=[ds_image])
    image_node.custom_params = {'image_shape': (images_size[0], images_size[1], 1),
                                'architecture': 'simplified',
                                'num_classes': 2,
                                'epochs': 15,
                                'batch_size': 128}

    # table
    ds_table = PrimaryNode('data_source_table')
    scaling_node = SecondaryNode('scaling', nodes_from=[ds_table])
    numeric_node = SecondaryNode('rf', nodes_from=[scaling_node])

    # text
    ds_text = PrimaryNode('data_source_text')
    node_text_clean = SecondaryNode('text_clean', nodes_from=[ds_text])
    text_node = SecondaryNode('tfidf', nodes_from=[node_text_clean])

    pipeline = Pipeline(SecondaryNode('logit', nodes_from=[numeric_node, image_node, text_node]))

    fit_data = MultiModalData({
        'data_source_img': train_img,
        'data_source_table': train_num,
        'data_source_text': train_text
    })

    pipeline.fit(fit_data)
    prediction = pipeline.predict(fit_data)

    assert prediction is not None
Example #19
def test_data_from_json():
    # several features
    files_path = os.path.join('test', 'data', 'multi_modal')
    path = os.path.join(str(fedot_project_root()), files_path)
    data = InputData.from_json_files(path,
                                     fields_to_use=['votes', 'year'],
                                     label='rating',
                                     task=Task(TaskTypesEnum.regression))
    assert data.features.shape[1] == 2  # check that there are two features
    assert len(data.target) == data.features.shape[0] == len(data.idx)

    # single feature
    data = InputData.from_json_files(path,
                                     fields_to_use=['votes'],
                                     label='rating',
                                     task=Task(TaskTypesEnum.regression))
    assert len(data.features.shape) == 1  # check that there is a single feature
    assert len(data.target) == len(data.features) == len(data.idx)
Example #20
def run_custom_example(
        timeout: datetime.timedelta = datetime.timedelta(minutes=0.2)):
    data = pd.read_csv(
        os.path.join(fedot_project_root(), 'examples', 'data',
                     'custom_encoded.csv'))
    nodes_types = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10']
    rules = [has_no_self_cycled_nodes, has_no_cycle, _has_no_duplicates]

    initial = CustomGraphModel(nodes=[
        CustomGraphNode(nodes_from=None, content=node_type)
        for node_type in nodes_types
    ])

    requirements = GPComposerRequirements(primary=nodes_types,
                                          secondary=nodes_types,
                                          max_arity=10,
                                          max_depth=10,
                                          pop_size=5,
                                          num_of_generations=5,
                                          crossover_prob=0.8,
                                          mutation_prob=0.9,
                                          timeout=timeout)

    optimiser_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.steady_state,
        mutation_types=[custom_mutation],
        crossover_types=[CrossoverTypesEnum.none],
        regularization_type=RegularizationTypesEnum.none)

    graph_generation_params = GraphGenerationParams(adapter=DirectAdapter(
        base_graph_class=CustomGraphModel, base_node_class=CustomGraphNode),
                                                    rules_for_constraint=rules)

    optimizer = GPGraphOptimiser(
        graph_generation_params=graph_generation_params,
        metrics=[],
        parameters=optimiser_parameters,
        requirements=requirements,
        initial_graph=initial,
        log=default_log(logger_name='Bayesian', verbose_level=1))

    optimized_network = optimizer.optimise(partial(custom_metric, data=data))

    optimized_network.show()
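
# Usage sketch: run the graph structure search with a slightly larger time
# budget than the default 0.2 minutes.
run_custom_example(timeout=datetime.timedelta(minutes=1))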
Example #21
def get_ts_data(n_steps=80, forecast_length=5):
    """ Prepare data from csv file with time series and take needed number of
    elements

    :param n_steps: number of elements in time series to take
    :param forecast_length: the length of forecast
    """
    project_root_path = str(fedot_project_root())
    file_path = os.path.join(project_root_path, 'test/data/simple_time_series.csv')
    df = pd.read_csv(file_path)

    time_series = np.array(df['sea_height'])[:n_steps]
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    data = InputData(idx=np.arange(0, len(time_series)),
                     features=time_series,
                     target=time_series,
                     task=task,
                     data_type=DataTypesEnum.ts)
    return train_test_data_setup(data)
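
# Usage sketch (illustrative): fit the lagged pipeline from the earlier example
# on the prepared split; get_ts_pipeline is assumed to come from the same test
# module as in test_lagged_with_invalid_params_fit_correctly.
train_data, test_data = get_ts_data(n_steps=200, forecast_length=5)
pipeline = get_ts_pipeline(window_size=20)
pipeline.fit(train_data)
forecast = pipeline.predict(test_data)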
Example #22
def create_multi_clf_examples_from_excel(file_path: str, return_df: bool = False):
    """ Return dataframe from excel file or path to the csv file """
    df = pd.read_excel(file_path, engine='openpyxl')
    train, test = split_data(df)
    file_dir_name = file_path.replace('.', '/').split('/')[-2]
    file_csv_name = f'{file_dir_name}.csv'
    directory_names = ['examples', 'data', file_dir_name]

    # Create the target directory if it does not exist yet
    ensure_directory_exists(directory_names)
    if return_df:
        # Need to return dataframe and path to the file in csv format
        path = os.path.join(directory_names[0], directory_names[1], directory_names[2], file_csv_name)
        full_file_path = os.path.join(str(fedot_project_root()), path)
        save_file_to_csv(df, full_file_path)
        return df, full_file_path
    else:
        # Need to return only paths to the files with train and test data
        full_train_file_path, full_test_file_path = get_split_data_paths(directory_names)
        save_file_to_csv(train, full_train_file_path)
        save_file_to_csv(test, full_test_file_path)
        return full_train_file_path, full_test_file_path
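
# Usage sketch: the xlsx path below is hypothetical; with return_df=True the
# function converts the excel sheet to csv and returns both the dataframe and
# the path to the saved csv file.
df, csv_path = create_multi_clf_examples_from_excel('examples/data/example.xlsx', return_df=True)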
Example #23
def test_river_levels_problem():
    # Initialise pipeline for river levels prediction
    node_encoder = PrimaryNode('one_hot_encoding')
    node_scaling = SecondaryNode('scaling', nodes_from=[node_encoder])
    node_ridge = SecondaryNode('ridge', nodes_from=[node_scaling])
    node_lasso = SecondaryNode('lasso', nodes_from=[node_scaling])
    node_final = SecondaryNode('rfr', nodes_from=[node_ridge, node_lasso])

    init_pipeline = Pipeline(node_final)

    project_root_path = str(fedot_project_root())
    file_path_train = os.path.join(project_root_path,
                                   'test/data/station_levels.csv')

    run_river_experiment(file_path=file_path_train,
                         pipeline=init_pipeline,
                         iterations=1,
                         tuner=PipelineTuner,
                         tuner_iterations=10)

    # Reaching this point means the experiment finished without errors
    is_experiment_finished = True

    assert is_experiment_finished
Example #24
def test_fedot_project_root():
    root_path = fedot_project_root()
    assert 'core' in os.listdir(os.path.join(root_path, 'fedot'))
    assert 'api' in os.listdir(os.path.join(root_path, 'fedot'))