def test_given_a_text_column_when_profiler_using_parallel_then_profiled_dataset_is_returned():
    # given
    source_dataframe = create_source_dataframe()
    csv_filename = f'{EXPECTED_DATA_PATH}/expected_profiled_dataframe.csv'
    expected_dataframe = pd.read_csv(csv_filename)

    # when: using default method (joblib Parallel) for parallelisation
    actual_dataframe = apply_text_profiling(source_dataframe, "text")

    # then
    assert_frame_equal(expected_dataframe, actual_dataframe, check_like=True)
def test_given_a_text_column_when_profiler_is_applied_then_profiled_dataset_is_returned(
):
    # given
    source_dataframe = create_source_dataframe()
    csv_filename = f'{EXPECTED_DATA_PATH}/expected_profiled_dataframe.csv'
    expected_dataframe = pd.read_csv(csv_filename)

    # when
    actual_dataframe = apply_text_profiling(source_dataframe, "text")

    # then
    assert_frame_equal(expected_dataframe, actual_dataframe, check_like=True)
def test_given_a_text_column_when_profiler_is_applied_without_granular_analysis_then_profiled_dataset_is_returned():
    # given
    source_dataframe = create_source_dataframe()
    csv_filename = f'{EXPECTED_DATA_PATH}/expected_profiled_dataframe_no_granular.csv'
    expected_dataframe = pd.read_csv(csv_filename)

    # when
    actual_dataframe = apply_text_profiling(
        source_dataframe, "text", {GRANULAR_OPTION: False}
    )

    # then
    assert_frame_equal(expected_dataframe, actual_dataframe, check_like=True)
def test_given_a_text_column_when_profiler_is_applied_without_high_level_analysis_then_profiled_dataset_is_returned():
    # given
    source_dataframe = create_source_dataframe()
    csv_filename = f'{EXPECTED_DATA_PATH}/expected_profiled_dataframe_no_high_level.csv'
    expected_dataframe = pd.read_csv(csv_filename)

    # when
    actual_dataframe = apply_text_profiling(
        source_dataframe, "text", {HIGH_LEVEL_OPTION: False, SPELLING_CHECK_OPTION: False}
    )

    # then
    assert_frame_equal(expected_dataframe, actual_dataframe, check_like=True)
def test_given_a_text_column_when_profiler_using_swifter_then_profiled_dataset_is_returned():
    # given
    source_dataframe = create_source_dataframe()
    csv_filename = f'{EXPECTED_DATA_PATH}/expected_profiled_dataframe.csv'
    expected_dataframe = pd.read_csv(csv_filename)

    # when: using swifter method for parallelisation
    actual_dataframe = apply_text_profiling(
        source_dataframe, "text", params={PARALLELISATION_METHOD_OPTION: SWIFTER_METHOD}
    )

    # then
    assert_frame_equal(expected_dataframe, actual_dataframe, check_like=True)
def test_given_a_text_column_when_profiler_is_applied_with_then_all_options_disabled_then_no_profiled_dataset_is_returned():
    # given
    source_dataframe = create_source_dataframe()
    expected_dataframe = source_dataframe.copy()

    # when
    actual_dataframe = apply_text_profiling(
        source_dataframe, "text", {HIGH_LEVEL_OPTION: False, SPELLING_CHECK_OPTION: False, GRANULAR_OPTION: False}
    )

    # then
    assert_frame_equal(actual_dataframe,
                       expected_dataframe,
                       check_like=True)  # source dataframe is returned
def test_given_a_text_column_when_profiler_is_applied_grammar_check_analysis_then_profiled_dataset_is_returned(
):
    # given
    source_dataframe = create_source_dataframe()
    csv_filename = f'{EXPECTED_DATA_PATH}/expected_profiled_dataframe_grammar_check.csv'
    expected_dataframe = pd.read_csv(csv_filename)

    # when: in the interest of time, only perform grammar check
    # other tests are covering for high_level and granular functionality
    actual_dataframe = apply_text_profiling(source_dataframe, "text", {
        'high_level': False,
        'granular': False,
        'grammar_check': True
    })
    actual_dataframe.to_csv(csv_filename, index=False)

    # then
    assert_equal(expected_dataframe, actual_dataframe)
def test_given_a_text_column_when_profiler_is_applied_grammar_check_analysis_then_profiled_dataset_is_returned(
):
    # given
    source_dataframe = create_source_dataframe()
    csv_filename = f'{EXPECTED_DATA_PATH}/expected_profiled_dataframe_grammar_check.csv'
    expected_dataframe = pd.read_csv(csv_filename)

    # when: in the interest of time, only perform grammar check
    # other tests are covering for high_level and granular functionality
    actual_dataframe = apply_text_profiling(
        source_dataframe, "text", {
            HIGH_LEVEL_OPTION: False,
            SPELLING_CHECK_OPTION: False,
            GRANULAR_OPTION: False,
            GRAMMAR_CHECK_OPTION: True
        })

    # then
    assert_equal(expected_dataframe, actual_dataframe)