Exemplo n.º 1
0
def test_json_to_spark_schema_correct_params(m_struct):
    """json_to_spark_schema is called with right parameters and in the right order."""
    # Arrange
    schema = create_json_schema()

    # Act
    json_to_spark_schema(schema)

    # Assert: the mocked StructType received exactly the schema we built.
    m_struct.fromJson.assert_called_with(create_json_schema())
Exemplo n.º 2
0
def test_json_to_spark_invalid_json(invalid_json):
    """json_to_spark_schema should raise TypeError for invalid json."""
    # Arrange & Act
    with pytest.raises(TypeError) as type_error:
        json_to_spark_schema(invalid_json)

    # Assert
    # Check the exception message itself: str(ExceptionInfo) is the traceback
    # entry repr, not the message — pytest docs say to use `.value`.
    assert 'Invalid json was provided' in str(type_error.value)
Exemplo n.º 3
0
def test_json_to_spark_schema_invalid(invalid_schema, missed_key):
    """json_to_spark_schema should raise KeyError for missing key."""
    # Arrange & Act
    with pytest.raises(KeyError) as key_error:
        json_to_spark_schema(create_json_schema(invalid_schema))

    # Assert
    # Build the expected message once, then check it against the raised
    # exception itself (`.value`) — str(ExceptionInfo) is the traceback
    # repr, not the message, per pytest docs.
    expected = "Missing key: '{0}'. Valid format: {1}".format(
        missed_key,
        'All schema columns must have a name, type and nullable key')
    assert expected in str(key_error.value)
Exemplo n.º 4
0
def test_json_to_spark_schema():
    """json_to_spark_schema should load the json schema as StructType."""
    # Arrange
    json_schema = create_json_schema()

    # Act
    spark_schema = json_to_spark_schema(json_schema)

    # Assert
    assert isinstance(spark_schema, StructType)
Exemplo n.º 5
0
def load_xml(spark: SparkSession, paths: List[str],
             json_schema: Dict[str,
                               JsonSchemaType], row_tag: str) -> DataFrame:
    """Load xml files and returns a DataFrame.

    Args:
        spark       (SparkSession): SparkSession from calling module.
        paths       (List[str]): List of paths which needs to be loaded into DF.
        json_schema (Dict[str, T]): Schema defined in json format.
        row_tag     (str): Specifying the root tag for the xml document.

    Returns:
        DataFrame: Dataframe on successful load.

    """
    # Convert the json schema once, then read all paths in a single pass
    # (spark-xml takes a comma-separated path list).
    return (spark.read.schema(
        json_to_spark_schema(json_schema)).format('xml').options(
            rowTag=row_tag).load(','.join(paths)))
Exemplo n.º 6
0
def load_json(spark: SparkSession, paths: List[str],
              json_schema: Dict[str, JsonSchemaType]) -> DataFrame:
    """Load json files and returns a DataFrame.

    Args:
        spark       (SparkSession): SparkSession from calling module.
        paths       (List[str]): List of paths to be loaded into DF.
        json_schema (Dict[str, T]): Schema defined in json format.

    Returns:
        DataFrame: Returns dataframe on successful load.

    Raises:
        FileNotFoundError: JSON file not found.

    """
    try:
        return (spark.read.schema(json_to_spark_schema(json_schema)).json(
            paths, multiLine=True))
    except AnalysisException as spark_exception:
        LOGGING.error(str(spark_exception))
        # Chain the original Spark error so the full cause survives in the
        # traceback instead of being reported as "during handling ...".
        raise FileNotFoundError(str(spark_exception)) from spark_exception