예제 #1
0
def test_load_xml_with_invalid_params(spark_session, schema, rowtag):
    """load_xml should still hand back a DataFrame for a bad schema or rowtag.

    ``schema`` and ``rowtag`` are supplied by the caller (presumably via
    pytest parametrization -- the decorator is not visible here).
    """
    xml_paths = ['./tests/load/data/employee.xml']
    json_schema = create_json_schema(schema)

    loaded = load_xml(
        spark_session,
        paths=xml_paths,
        json_schema=json_schema,
        row_tag=rowtag,
    )

    # Only the type of the result is checked, not its contents.
    assert isinstance(loaded, DataFrame)
예제 #2
0
def test_load_json_invalid_path(spark_session, pytestconfig):
    """load_json should raise FileNotFoundError for a non-existent path."""
    # Arrange
    invalid_file_paths = ['./file/that/do/not/exist.json']
    execution_dir = pytestconfig.rootdir
    expected_message = (
        'Path does not exist: file:{}/file/that/do/not/exist.json'.format(
            execution_dir))

    # Act
    with pytest.raises(FileNotFoundError) as excinfo:
        load_json(spark_session, invalid_file_paths,
                  create_json_schema('missing'))

    # Assert
    # Match against the raised exception itself (excinfo.value).
    # str(excinfo) is the text of pytest's ExceptionInfo wrapper, whose
    # format changed between pytest versions and may be truncated.
    assert expected_message in str(excinfo.value)
예제 #3
0
def test_json_load_with_invalid_schema(spark_session, schema):
    """
    load_json should return a DataFrame even when the schema is invalid.

    Schema variants this test is meant to be run with:

    1. Valid schema that matches data
    2. Schema with Integer field defined as a string
    3. Schema with String field defined as an Integer
    4. A missing nullable field
    5. A missing non-nullable field
    """
    sample_paths = ['./tests/data/json_sample.json']
    json_schema = create_json_schema(schema)

    loaded = load_json(
        spark_session,
        paths=sample_paths,
        json_schema=json_schema,
    )

    # Only the type of the result is checked, not its contents.
    assert isinstance(loaded, DataFrame)
예제 #4
0
def test_load_json_passes_correct_parameters():
    """load_json should forward the schema and paths to the spark reader."""
    # Arrange: name each link of the mocked read.schema(...).json(...) chain
    # so the assertions below can refer to it directly.
    spark_mock = mock.Mock()
    schema_mock = spark_mock.read.schema
    json_mock = schema_mock.return_value.json
    json_mock.return_value = 'df_result'
    json_paths = ['./big_data_getl/load/test/data/sample.json']

    # Act
    loaded = load_json(spark_mock, json_paths, create_json_schema())

    # Assert
    assert loaded == 'df_result'
    schema_mock.assert_called_with(create_valid_schema())
    json_mock.assert_called_with(json_paths, multiLine=True)
예제 #5
0
def test_load_json_valid_path(spark_session):
    """load_json should load the sample json file into a dataframe."""
    # Arrange
    file_paths = ['./tests/data/json_sample.json']
    expected_row = {
        'name': 'Mark Steelspitter',
        'empid': 9,
        'happy': False,
    }

    # Act
    loaded = load_json(spark_session, file_paths, create_json_schema())

    # Assert
    # NOTE(review): the comparison is on the string form of both dataframes;
    # for a Spark DataFrame that presumably renders only column names and
    # types, not row data -- confirm whether rows should be compared too.
    actual_text = str(loaded)
    expected_df = spark_session.createDataFrame([expected_row],
                                                create_valid_schema())
    assert actual_text == str(expected_df)
예제 #6
0
def test_load_xml_passes_correct_parameters():
    """load_xml should forward schema, format, row tag and path to spark."""
    # Arrange: name each link of the mocked
    # read.schema(...).format(...).options(...).load(...) chain.
    xml_path = './big_data_getl/load/test/data/employee.xml'
    spark_mock = mock.Mock()
    schema_mock = spark_mock.read.schema
    format_mock = schema_mock.return_value.format
    options_mock = format_mock.return_value.options
    load_mock = options_mock.return_value.load
    load_mock.return_value = 'df_result'

    # Act
    loaded = load_xml(spark_mock, [xml_path], create_json_schema(),
                      'employee')

    # Assert
    assert loaded == 'df_result'
    schema_mock.assert_called_with(create_valid_schema())
    format_mock.assert_called_with('xml')
    options_mock.assert_called_with(rowTag='employee')
    load_mock.assert_called_with(xml_path)